Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
      6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
      7 
      8 #include <set>
      9 #include <vector>
     10 
     11 #include "chrome/browser/safe_browsing/safe_browsing_store.h"
     12 
     13 #include "base/callback.h"
     14 #include "base/files/file_path.h"
     15 #include "base/files/scoped_file.h"
     16 
     17 // Implement SafeBrowsingStore in terms of a flat file.  The file
     18 // format is pretty literal:
     19 //
     20 // int32 magic;             // magic number "validating" file
     21 // int32 version;           // format version
     22 //
     23 // // Counts for the various data which follows the header.
     24 // uint32 add_chunk_count;  // Chunks seen, including empties.
     25 // uint32 sub_chunk_count;  // Ditto.
     26 // uint32 shard_stride;     // SBPrefix space covered per shard.
     27 //                          // 0==entire space in one shard.
     28 // // Sorted by chunk_id.
     29 // array[add_chunk_count] {
     30 //   int32 chunk_id;
     31 // }
     32 // // Sorted by chunk_id.
     33 // array[sub_chunk_count] {
     34 //   int32 chunk_id;
     35 // }
     36 // MD5Digest header_checksum;  // Checksum over preceding data.
     37 //
     38 // // Sorted by prefix, then add chunk_id, then hash, both within shards and
     39 // // overall.
     40 // array[from 0 to wraparound to 0 by shard_stride] {
     41 //   uint32 add_prefix_count;
     42 //   uint32 sub_prefix_count;
     43 //   uint32 add_hash_count;
     44 //   uint32 sub_hash_count;
     45 //   array[add_prefix_count] {
     46 //     int32 chunk_id;
     47 //     uint32 prefix;
     48 //   }
     49 //   array[sub_prefix_count] {
     50 //     int32 chunk_id;
     51 //     int32 add_chunk_id;
     52 //     uint32 add_prefix;
     53 //   }
     54 //   array[add_hash_count] {
     55 //     int32 chunk_id;
     56 //     int32 received_time;     // From base::Time::ToTimeT().
     57 //     char[32] full_hash;
     58 //   }
     59 //   array[sub_hash_count] {
     60 //     int32 chunk_id;
     61 //     int32 add_chunk_id;
     62 //     char[32] add_full_hash;
     63 //   }
     64 // }
     65 // MD5Digest checksum;      // Checksum over entire file.
     66 //
     67 // The checksums are used to allow writing the file without doing an expensive
     68 // fsync().  Since the data can be re-fetched, failing the checksum is not
     69 // catastrophic.  Histograms indicate that file corruption here is pretty
     70 // uncommon.
     71 //
     72 // The |header_checksum| is present to guarantee valid header and chunk data for
     73 // updates.  Only that part of the file needs to be read to post the update.
     74 //
     75 // |shard_stride| breaks the file into approximately-equal portions, allowing
     76 // updates to stream from one file to another with modest memory usage.  It is
     77 // dynamic to adjust to different file sizes without adding excessive overhead.
     78 //
     79 // During the course of an update, uncommitted data is stored in a
     80 // temporary file (which is later re-used to commit).  This is an
     81 // array of chunks, with the count kept in memory until the end of the
     82 // transaction.  The format of this file is like the main file, with
     83 // the list of chunks seen omitted, as that data is tracked in-memory:
     84 //
     85 // array[] {
     86 //   uint32 add_prefix_count;
     87 //   uint32 sub_prefix_count;
     88 //   uint32 add_hash_count;
     89 //   uint32 sub_hash_count;
     90 //   array[add_prefix_count] {
     91 //     int32 chunk_id;
     92 //     uint32 prefix;
     93 //   }
     94 //   array[sub_prefix_count] {
     95 //     int32 chunk_id;
     96 //     int32 add_chunk_id;
     97 //     uint32 add_prefix;
     98 //   }
     99 //   array[add_hash_count] {
    100 //     int32 chunk_id;
    101 //     int32 received_time;     // From base::Time::ToTimeT().
    102 //     char[32] full_hash;
    103 //   }
    104 //   array[sub_hash_count] {
    105 //     int32 chunk_id;
    106 //     int32 add_chunk_id;
    107 //     char[32] add_full_hash;
    108 //   }
    109 // }
    110 //
    111 // The overall transaction works like this:
    112 // - Open the original file to get the chunks-seen data.
    113 // - Open a temp file for storing new chunk info.
    114 // - Write new chunks to the temp file.
    115 // - When the transaction is finished:
    116 //   - Read the update data from the temp file into memory.
    117 //   - Overwrite the temp file with new header data.
    118 //   - Until done:
    119 //     - Read shards of the original file's data into memory.
    120 //     - Merge from the update data.
    121 //     - Write shards to the temp file.
    122 //   - Delete original file.
    123 //   - Rename temp file to original filename.
    124 
    125 class SafeBrowsingStoreFile : public SafeBrowsingStore {
    126  public:
    127   SafeBrowsingStoreFile();
    128   virtual ~SafeBrowsingStoreFile();
    129 
          // SafeBrowsingStore override.  Presumably records |filename| as the
          // main database file (|filename_|) and |corruption_callback| as the
          // callback used by OnCorruptDatabase() -- TODO confirm in the .cc file.
    130   virtual void Init(const base::FilePath& filename,
    131                     const base::Closure& corruption_callback) OVERRIDE;
    132 
    133   // Delete any on-disk files, including the permanent storage.
    134   virtual bool Delete() OVERRIDE;
    135 
    136   // Get all add hash prefixes and full-length hashes, respectively, from
    137   // the store.
    138   virtual bool GetAddPrefixes(SBAddPrefixes* add_prefixes) OVERRIDE;
    139   virtual bool GetAddFullHashes(
    140       std::vector<SBAddFullHash>* add_full_hashes) OVERRIDE;
    141 
          // Chunk-level writes.  Per the comment on the buffers in the private
          // section, data is collected between BeginChunk() and FinishChunk().
    142   virtual bool BeginChunk() OVERRIDE;
    143 
    144   virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) OVERRIDE;
    145   virtual bool WriteAddHash(int32 chunk_id,
    146                             const SBFullHash& full_hash) OVERRIDE;
    147   virtual bool WriteSubPrefix(int32 chunk_id,
    148                               int32 add_chunk_id, SBPrefix prefix) OVERRIDE;
    149   virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id,
    150                             const SBFullHash& full_hash) OVERRIDE;
    151   virtual bool FinishChunk() OVERRIDE;
    152 
          // Update transaction entry points.  The file comment at the top of
          // this header describes the overall flow.  FinishUpdate() takes the
          // same arguments as the private DoUpdate(), which does the actual
          // update.
    153   virtual bool BeginUpdate() OVERRIDE;
    154   virtual bool FinishUpdate(
    155       safe_browsing::PrefixSetBuilder* builder,
    156       std::vector<SBAddFullHash>* add_full_hashes_result) OVERRIDE;
    157   virtual bool CancelUpdate() OVERRIDE;
    158 
          // Chunks-seen bookkeeping for add and sub chunks.  Presumably backed
          // by |add_chunks_cache_| and |sub_chunks_cache_| -- TODO confirm in
          // the .cc file.
    159   virtual void SetAddChunk(int32 chunk_id) OVERRIDE;
    160   virtual bool CheckAddChunk(int32 chunk_id) OVERRIDE;
    161   virtual void GetAddChunks(std::vector<int32>* out) OVERRIDE;
    162   virtual void SetSubChunk(int32 chunk_id) OVERRIDE;
    163   virtual bool CheckSubChunk(int32 chunk_id) OVERRIDE;
    164   virtual void GetSubChunks(std::vector<int32>* out) OVERRIDE;
    165 
          // Request deletion of a chunk.  Presumably recorded in
          // |add_del_cache_| / |sub_del_cache_| and applied on FinishUpdate(),
          // as the comment on those members suggests -- TODO confirm.
    166   virtual void DeleteAddChunk(int32 chunk_id) OVERRIDE;
    167   virtual void DeleteSubChunk(int32 chunk_id) OVERRIDE;
    168 
    169   // Verify |file_|'s checksum, calling the corruption callback if it
    170   // does not check out.  Empty input is considered valid.
    171   virtual bool CheckValidity() OVERRIDE;
    172 
    173   // Returns the name of the temporary file used to buffer data for
    174   // |filename|.  Exported for unit tests.
          // The returned path is |filename| with the literal suffix "_new"
          // appended.
    175   static const base::FilePath TemporaryFileForFilename(
    176       const base::FilePath& filename) {
    177     return base::FilePath(filename.value() + FILE_PATH_LITERAL("_new"));
    178   }
    179 
    180   // Delete any on-disk files, including the permanent storage.
          // NOTE(review): same wording as Delete() above; this static variant
          // takes |basename| and needs no instance.  Which files it removes is
          // defined in the .cc file -- confirm there.
    181   static bool DeleteStore(const base::FilePath& basename);
    182 
    183  private:
    184   // Does the actual update for FinishUpdate(), so that FinishUpdate() can clean
    185   // up correctly in case of error.
          // NOTE(review): declared virtual without OVERRIDE; whether anything
          // overrides it cannot be determined from this header.
    186   virtual bool DoUpdate(safe_browsing::PrefixSetBuilder* builder,
    187                         std::vector<SBAddFullHash>* add_full_hashes_result);
    188 
    189   // Some very lucky users have an original-format file still in their
    190   // profile.  Check for it and delete, recording a histogram for the
    191   // result (no histogram for not-found).  Logically this
    192   // would make more sense at the SafeBrowsingDatabase level, but
    193   // practically speaking that code doesn't touch files directly.
    194   static void CheckForOriginalAndDelete(const base::FilePath& filename);
    195 
    196   // Close all files and clear all buffers.
    197   bool Close();
    198 
    199   // Calls |corruption_callback_| if non-NULL, always returns false as
    200   // a convenience to the caller.
    201   bool OnCorruptDatabase();
    202 
    203   // Helper for creating a corruption callback for |old_store_|.
    204   // TODO(shess): Remove after migration.
          // NOTE(review): no |old_store_| member is declared in this class, so
          // the comment above may be stale -- confirm before relying on it.
    205   void HandleCorruptDatabase();
    206 
    207   // Clear temporary buffers used to accumulate chunk data.
          // Always returns true.
    208   bool ClearChunkBuffers() {
    209     // NOTE: .clear() doesn't release memory.
    210     // TODO(shess): Figure out if this is overkill.  Some amount of
    211     // pre-reserved space is probably reasonable between each chunk
    212     // collected.
            // Each buffer is swapped with a freshly-constructed empty temporary,
            // so the member ends up empty and the temporary (destroyed at the
            // end of the statement) takes the old contents.  Assumes standard
            // container swap semantics for the SB* typedefs.
    213     SBAddPrefixes().swap(add_prefixes_);
    214     SBSubPrefixes().swap(sub_prefixes_);
    215     std::vector<SBAddFullHash>().swap(add_hashes_);
    216     std::vector<SBSubFullHash>().swap(sub_hashes_);
    217     return true;
    218   }
    219 
    220   // Clear all buffers used during update.
          // Besides the per-chunk buffers, this resets |chunks_written_| to 0
          // and empties the chunks-seen and deleted-chunk caches.
    221   void ClearUpdateBuffers() {
    222     ClearChunkBuffers();
    223     chunks_written_ = 0;
            // Same swap-with-empty-temporary approach as ClearChunkBuffers().
    224     std::set<int32>().swap(add_chunks_cache_);
    225     std::set<int32>().swap(sub_chunks_cache_);
    226     base::hash_set<int32>().swap(add_del_cache_);
    227     base::hash_set<int32>().swap(sub_del_cache_);
    228   }
    229 
    230   // Buffers for collecting data between BeginChunk() and
    231   // FinishChunk().
    232   SBAddPrefixes add_prefixes_;
    233   SBSubPrefixes sub_prefixes_;
    234   std::vector<SBAddFullHash> add_hashes_;
    235   std::vector<SBSubFullHash> sub_hashes_;
    236 
    237   // Count of chunks collected in |new_file_|.
          // Reset to 0 by ClearUpdateBuffers().
    238   int chunks_written_;
    239 
    240   // Name of the main database file.
    241   base::FilePath filename_;
    242 
    243   // Handles to the main and scratch files.  |empty_| is true if the
    244   // main file didn't exist when the update was started.
    245   base::ScopedFILE file_;
    246   base::ScopedFILE new_file_;
    247   bool empty_;
    248 
    249   // Cache of chunks which have been seen.  Loaded from the database
    250   // on BeginUpdate() so that it can be queried during the
    251   // transaction.
    252   std::set<int32> add_chunks_cache_;
    253   std::set<int32> sub_chunks_cache_;
    254 
    255   // Cache the set of deleted chunks during a transaction, applied on
    256   // FinishUpdate().
    257   // TODO(shess): If the set is small enough, hash_set<> might be
    258   // slower than plain set<>.
    259   base::hash_set<int32> add_del_cache_;
    260   base::hash_set<int32> sub_del_cache_;
    261 
          // Callback run by OnCorruptDatabase() when non-NULL.  Presumably set
          // from Init()'s |corruption_callback| argument -- TODO confirm.
    262   base::Closure corruption_callback_;
    263 
    264   // Tracks whether corruption has already been seen in the current
    265   // update, so that only one instance is recorded in the stats.
    266   // TODO(shess): Remove with format-migration support.
    267   bool corruption_seen_;
    268 
    269   DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile);
    270 };
    271 
    272 #endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
    273