Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
      6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
      7 #pragma once
      8 
      9 #include <set>
     10 #include <vector>
     11 
     12 #include "chrome/browser/safe_browsing/safe_browsing_store.h"
     13 
     14 #include "base/callback.h"
     15 #include "base/file_util.h"
     16 
     17 // Implement SafeBrowsingStore in terms of a flat file.  The file
     18 // format is pretty literal:
     19 //
     20 // int32 magic;             // magic number "validating" file
     21 // int32 version;           // format version
     22 //
     23 // // Counts for the various data which follows the header.
     24 // uint32 add_chunk_count;   // Chunks seen, including empties.
     25 // uint32 sub_chunk_count;   // Ditto.
     26 // uint32 add_prefix_count;
     27 // uint32 sub_prefix_count;
     28 // uint32 add_hash_count;
     29 // uint32 sub_hash_count;
     30 //
     31 // array[add_chunk_count] {
     32 //   int32 chunk_id;
     33 // }
     34 // array[sub_chunk_count] {
     35 //   int32 chunk_id;
     36 // }
     37 // array[add_prefix_count] {
     38 //   int32 chunk_id;
     39 //   int32 prefix;
     40 // }
     41 // array[sub_prefix_count] {
     42 //   int32 chunk_id;
     43 //   int32 add_chunk_id;
     44 //   int32 add_prefix;
     45 // }
     46 // array[add_hash_count] {
     47 //   int32 chunk_id;
     48 //   int32 received_time;     // From base::Time::ToTimeT().
     49 //   char[32] full_hash;
        // }
     50 // array[sub_hash_count] {
     51 //   int32 chunk_id;
     52 //   int32 add_chunk_id;
     53 //   char[32] add_full_hash;
     54 // }
     55 // MD5Digest checksum;      // Checksum over preceding data.
     56 //
     57 // During the course of an update, uncommitted data is stored in a
     58 // temporary file (which is later re-used to commit).  This is an
     59 // array of chunks, with the count kept in memory until the end of the
     60 // transaction.  The format of this file is like the main file, with
     61 // the list of chunks seen omitted, as that data is tracked in-memory:
     62 //
     63 // array[] {
     64 //   uint32 add_prefix_count;
     65 //   uint32 sub_prefix_count;
     66 //   uint32 add_hash_count;
     67 //   uint32 sub_hash_count;
     68 //   array[add_prefix_count] {
     69 //     int32 chunk_id;
     70 //     int32 prefix;
     71 //   }
     72 //   array[sub_prefix_count] {
     73 //     int32 chunk_id;
     74 //     int32 add_chunk_id;
     75 //     int32 add_prefix;
     76 //   }
     77 //   array[add_hash_count] {
     78 //     int32 chunk_id;
     79 //     int32 received_time;     // From base::Time::ToTimeT().
     80 //     char[32] full_hash;
     81 //   }
     82 //   array[sub_hash_count] {
     83 //     int32 chunk_id;
     84 //     int32 add_chunk_id;
     85 //     char[32] add_full_hash;
     86 //   }
     87 // }
     88 //
     89 // The overall transaction works like this:
     90 // - Open the original file to get the chunks-seen data.
     91 // - Open a temp file for storing new chunk info.
     92 // - Write new chunks to the temp file.
     93 // - When the transaction is finished:
     94 //   - Read the rest of the original file's data into buffers.
     95 //   - Rewind the temp file and merge the new data into buffers.
     96 //   - Process buffers for deletions and apply subs.
     97 //   - Rewind and write the buffers out to temp file.
     98 //   - Delete original file.
     99 //   - Rename temp file to original filename.
    100 
    101 // TODO(shess): By using a checksum, this code can avoid doing an
    102 // fsync(), at the possible cost of more frequently retrieving the
    103 // full dataset.  Measure how often this occurs, and if it occurs too
    104 // often, consider retaining the last known-good file for recovery
    105 // purposes, rather than deleting it.
    106 
    107 class SafeBrowsingStoreFile : public SafeBrowsingStore {
          // SafeBrowsingStore implementation that keeps its data in a flat file
          // (|filename_|) and buffers update data in a temporary file (see
          // TemporaryFileForFilename()).
          // NOTE(review): the virtual methods below are presumably overrides of
          // the SafeBrowsingStore interface, which is declared in another
          // header -- confirm there for their full contracts.
    108  public:
    109   SafeBrowsingStoreFile();
    110   virtual ~SafeBrowsingStoreFile();
    111 
          // NOTE(review): |filename| is presumably the main database file
          // (see |filename_|), and |corruption_callback| is presumably stored
          // in |corruption_callback_| (a scoped_ptr), which would make this
          // object its owner -- confirm against the implementation.
    112   virtual void Init(const FilePath& filename,
    113                     Callback0::Type* corruption_callback);
    114 
    115   // Delete any on-disk files, including the permanent storage.
    116   virtual bool Delete();
    117 
    118   // Get all add hash prefixes and full-length hashes, respectively, from
    119   // the store.
    120   virtual bool GetAddPrefixes(std::vector<SBAddPrefix>* add_prefixes);
    121   virtual bool GetAddFullHashes(std::vector<SBAddFullHash>* add_full_hashes);
    122 
          // Per-chunk writes.  Per the member comments below, data is collected
          // in in-memory buffers between BeginChunk() and FinishChunk().
    123   virtual bool BeginChunk();
    124 
    125   virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix);
    126   virtual bool WriteAddHash(int32 chunk_id,
    127                             base::Time receive_time,
    128                             const SBFullHash& full_hash);
    129   virtual bool WriteSubPrefix(int32 chunk_id,
    130                               int32 add_chunk_id, SBPrefix prefix);
    131   virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id,
    132                             const SBFullHash& full_hash);
    133   virtual bool FinishChunk();
    134 
    135   virtual bool BeginUpdate();
    136   // Store updates with pending add full hashes in file store and
    137   // return |add_prefixes_result| and |add_full_hashes_result|.
    138   virtual bool FinishUpdate(const std::vector<SBAddFullHash>& pending_adds,
    139                             const std::set<SBPrefix>& prefix_misses,
    140                             std::vector<SBAddPrefix>* add_prefixes_result,
    141                             std::vector<SBAddFullHash>* add_full_hashes_result);
    142   virtual bool CancelUpdate();
    143 
          // Chunks-seen bookkeeping.
          // NOTE(review): presumably backed by |add_chunks_cache_| and
          // |sub_chunks_cache_| -- confirm against the implementation.
    144   virtual void SetAddChunk(int32 chunk_id);
    145   virtual bool CheckAddChunk(int32 chunk_id);
    146   virtual void GetAddChunks(std::vector<int32>* out);
    147   virtual void SetSubChunk(int32 chunk_id);
    148   virtual bool CheckSubChunk(int32 chunk_id);
    149   virtual void GetSubChunks(std::vector<int32>* out);
    150 
          // NOTE(review): presumably these record |chunk_id| in
          // |add_del_cache_| / |sub_del_cache_|, whose comment says deletions
          // are applied on FinishUpdate() -- confirm against the implementation.
    151   virtual void DeleteAddChunk(int32 chunk_id);
    152   virtual void DeleteSubChunk(int32 chunk_id);
    153 
    154   // Returns the name of the temporary file used to buffer data for
    155   // |filename|.  Exported for unit tests.  The temporary name is
          // |filename| with the literal suffix "_new" appended.
    156   static const FilePath TemporaryFileForFilename(const FilePath& filename) {
    157     return FilePath(filename.value() + FILE_PATH_LITERAL("_new"));
    158   }
    159 
    160  private:
    161   // Update store file with pending full hashes.
          // NOTE(review): takes the same arguments as FinishUpdate(), so it is
          // presumably the worker behind it -- confirm in the implementation.
    162   virtual bool DoUpdate(const std::vector<SBAddFullHash>& pending_adds,
    163                         const std::set<SBPrefix>& prefix_misses,
    164                         std::vector<SBAddPrefix>* add_prefixes_result,
    165                         std::vector<SBAddFullHash>* add_full_hashes_result);
    166 
    167   // Enumerate different format-change events for histogramming
    168   // purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
    169   // TODO(shess): Remove this once the format change is complete.
    170   enum FormatEventType {
    171     // Corruption detected, broken down by file format.
    172     FORMAT_EVENT_FILE_CORRUPT,
    173     FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete
    174 
    175     // The type of format found in the file.  The expected case (new
    176     // file format) is intentionally not covered.
    177     FORMAT_EVENT_FOUND_SQLITE,
    178     FORMAT_EVENT_FOUND_UNKNOWN,
    179 
    180     // The number of SQLite-format files deleted should be the same as
    181     // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
    182     // or if a failure prevents the update from succeeding.
    183     FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
    184     FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete
    185 
    186     // Found and deleted (or failed to delete) the ancient "Safe
    187     // Browsing" file.
    188     FORMAT_EVENT_DELETED_ORIGINAL,
    189     FORMAT_EVENT_DELETED_ORIGINAL_FAILED,
    190 
    191     // Memory space for histograms is determined by the max.  ALWAYS
    192     // ADD NEW VALUES BEFORE THIS ONE.
    193     FORMAT_EVENT_MAX
    194   };
    195 
    196   // Helper to record an event related to format conversion from
    197   // SQLite to file.
    198   static void RecordFormatEvent(FormatEventType event_type);
    199 
    200   // Some very lucky users have an original-format file still in their
    201   // profile.  Check for it and delete, recording a histogram for the
    202   // result (no histogram for not-found).  Logically this
    203   // would make more sense at the SafeBrowsingDatabase level, but
    204   // practically speaking that code doesn't touch files directly.
    205   static void CheckForOriginalAndDelete(const FilePath& filename);
    206 
    207   // Close all files and clear all buffers.
    208   bool Close();
    209 
    210   // Calls |corruption_callback_| if non-NULL, always returns false as
    211   // a convenience to the caller.
    212   bool OnCorruptDatabase();
    213 
    214   // Helper for creating a corruption callback for |old_store_|.
    215   // TODO(shess): Remove after migration.
          // NOTE(review): this class declares no |old_store_| member, so the
          // comment above looks stale -- confirm whether this helper is still
          // needed.
    216   void HandleCorruptDatabase();
    217 
    218   // Clear temporary buffers used to accumulate chunk data.  Always
          // returns true.
    219   bool ClearChunkBuffers() {
    220     // NOTE: .clear() doesn't release memory.  Swapping each member with
            // an empty temporary hands the old storage to the temporary, which
            // frees it when the statement ends.
    221     // TODO(shess): Figure out if this is overkill.  Some amount of
    222     // pre-reserved space is probably reasonable between each chunk
    223     // collected.
    224     std::vector<SBAddPrefix>().swap(add_prefixes_);
    225     std::vector<SBSubPrefix>().swap(sub_prefixes_);
    226     std::vector<SBAddFullHash>().swap(add_hashes_);
    227     std::vector<SBSubFullHash>().swap(sub_hashes_);
    228     return true;
    229   }
    230 
    231   // Clear all buffers used during update: the per-chunk buffers, the
          // written-chunk counter, the chunks-seen caches and the deletion
          // caches.  Containers are emptied by swapping with an empty temporary,
          // as in ClearChunkBuffers().
    232   void ClearUpdateBuffers() {
    233     ClearChunkBuffers();
    234     chunks_written_ = 0;
    235     std::set<int32>().swap(add_chunks_cache_);
    236     std::set<int32>().swap(sub_chunks_cache_);
    237     base::hash_set<int32>().swap(add_del_cache_);
    238     base::hash_set<int32>().swap(sub_del_cache_);
    239   }
    240 
    241   // Buffers for collecting data between BeginChunk() and
    242   // FinishChunk().
    243   std::vector<SBAddPrefix> add_prefixes_;
    244   std::vector<SBSubPrefix> sub_prefixes_;
    245   std::vector<SBAddFullHash> add_hashes_;
    246   std::vector<SBSubFullHash> sub_hashes_;
    247 
    248   // Count of chunks collected in |new_file_|.
    249   int chunks_written_;
    250 
    251   // Name of the main database file.
    252   FilePath filename_;
    253 
    254   // Handles to the main and scratch files.  |empty_| is true if the
    255   // main file didn't exist when the update was started.
    256   file_util::ScopedFILE file_;
    257   file_util::ScopedFILE new_file_;
    258   bool empty_;
    259 
    260   // Cache of chunks which have been seen.  Loaded from the database
    261   // on BeginUpdate() so that it can be queried during the
    262   // transaction.
    263   std::set<int32> add_chunks_cache_;
    264   std::set<int32> sub_chunks_cache_;
    265 
    266   // Cache the set of deleted chunks during a transaction, applied on
    267   // FinishUpdate().
    268   // TODO(shess): If the set is small enough, hash_set<> might be
    269   // slower than plain set<>.
          // NOTE(review): no hash-table or scoped_ptr header is included by name
          // at the top of this file; base::hash_set and scoped_ptr presumably
          // arrive transitively through the headers that are included -- confirm.
    270   base::hash_set<int32> add_del_cache_;
    271   base::hash_set<int32> sub_del_cache_;
    272 
    273   scoped_ptr<Callback0::Type> corruption_callback_;
    274 
    275   // Tracks whether corruption has already been seen in the current
    276   // update, so that only one instance is recorded in the stats.
    277   // TODO(shess): Remove with format-migration support.
    278   bool corruption_seen_;
    279 
    280   DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile);
    281 };
    282 
    283 #endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
    284