1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_ 6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_ 7 8 #include <set> 9 #include <vector> 10 11 #include "chrome/browser/safe_browsing/safe_browsing_store.h" 12 13 #include "base/callback.h" 14 #include "base/files/file_path.h" 15 #include "base/files/scoped_file.h" 16 17 // Implement SafeBrowsingStore in terms of a flat file. The file 18 // format is pretty literal: 19 // 20 // int32 magic; // magic number "validating" file 21 // int32 version; // format version 22 // 23 // // Counts for the various data which follows the header. 24 // uint32 add_chunk_count; // Chunks seen, including empties. 25 // uint32 sub_chunk_count; // Ditto. 26 // uint32 shard_stride; // SBPrefix space covered per shard. 27 // // 0==entire space in one shard. 28 // // Sorted by chunk_id. 29 // array[add_chunk_count] { 30 // int32 chunk_id; 31 // } 32 // // Sorted by chunk_id. 33 // array[sub_chunk_count] { 34 // int32 chunk_id; 35 // } 36 // MD5Digest header_checksum; // Checksum over preceding data. 37 // 38 // // Sorted by prefix, then add chunk_id, then hash, both within shards and 39 // // overall. 40 // array[from 0 to wraparound to 0 by shard_stride] { 41 // uint32 add_prefix_count; 42 // uint32 sub_prefix_count; 43 // uint32 add_hash_count; 44 // uint32 sub_hash_count; 45 // array[add_prefix_count] { 46 // int32 chunk_id; 47 // uint32 prefix; 48 // } 49 // array[sub_prefix_count] { 50 // int32 chunk_id; 51 // int32 add_chunk_id; 52 // uint32 add_prefix; 53 // } 54 // array[add_hash_count] { 55 // int32 chunk_id; 56 // int32 received_time; // From base::Time::ToTimeT(). 
57 // char[32] full_hash; 58 // } 59 // array[sub_hash_count] { 60 // int32 chunk_id; 61 // int32 add_chunk_id; 62 // char[32] add_full_hash; 63 // } 64 // } 65 // MD5Digest checksum; // Checksum over entire file. 66 // 67 // The checksums are used to allow writing the file without doing an expensive 68 // fsync(). Since the data can be re-fetched, failing the checksum is not 69 // catastrophic. Histograms indicate that file corruption here is pretty 70 // uncommon. 71 // 72 // The |header_checksum| is present to guarantee valid header and chunk data for 73 // updates. Only that part of the file needs to be read to post the update. 74 // 75 // |shard_stride| breaks the file into approximately-equal portions, allowing 76 // updates to stream from one file to another with modest memory usage. It is 77 // dynamic to adjust to different file sizes without adding excessive overhead. 78 // 79 // During the course of an update, uncommitted data is stored in a 80 // temporary file (which is later re-used to commit). This is an 81 // array of chunks, with the count kept in memory until the end of the 82 // transaction. The format of this file is like the main file, with 83 // the list of chunks seen omitted, as that data is tracked in-memory: 84 // 85 // array[] { 86 // uint32 add_prefix_count; 87 // uint32 sub_prefix_count; 88 // uint32 add_hash_count; 89 // uint32 sub_hash_count; 90 // array[add_prefix_count] { 91 // int32 chunk_id; 92 // uint32 prefix; 93 // } 94 // array[sub_prefix_count] { 95 // int32 chunk_id; 96 // int32 add_chunk_id; 97 // uint32 add_prefix; 98 // } 99 // array[add_hash_count] { 100 // int32 chunk_id; 101 // int32 received_time; // From base::Time::ToTimeT(). 102 // char[32] full_hash; 103 // } 104 // array[sub_hash_count] { 105 // int32 chunk_id; 106 // int32 add_chunk_id; 107 // char[32] add_full_hash; 108 // } 109 // } 110 // 111 // The overall transaction works like this: 112 // - Open the original file to get the chunks-seen data. 
113 // - Open a temp file for storing new chunk info. 114 // - Write new chunks to the temp file. 115 // - When the transaction is finished: 116 // - Read the update data from the temp file into memory. 117 // - Overwrite the temp file with new header data. 118 // - Until done: 119 // - Read shards of the original file's data into memory. 120 // - Merge from the update data. 121 // - Write shards to the temp file. 122 // - Delete original file. 123 // - Rename temp file to original filename. 124 125 class SafeBrowsingStoreFile : public SafeBrowsingStore { 126 public: 127 SafeBrowsingStoreFile(); 128 virtual ~SafeBrowsingStoreFile(); 129 130 virtual void Init(const base::FilePath& filename, 131 const base::Closure& corruption_callback) OVERRIDE; 132 133 // Delete any on-disk files, including the permanent storage. 134 virtual bool Delete() OVERRIDE; 135 136 // Get all add hash prefixes and full-length hashes, respectively, from 137 // the store. 138 virtual bool GetAddPrefixes(SBAddPrefixes* add_prefixes) OVERRIDE; 139 virtual bool GetAddFullHashes( 140 std::vector<SBAddFullHash>* add_full_hashes) OVERRIDE; 141 142 virtual bool BeginChunk() OVERRIDE; 143 144 virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) OVERRIDE; 145 virtual bool WriteAddHash(int32 chunk_id, 146 const SBFullHash& full_hash) OVERRIDE; 147 virtual bool WriteSubPrefix(int32 chunk_id, 148 int32 add_chunk_id, SBPrefix prefix) OVERRIDE; 149 virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id, 150 const SBFullHash& full_hash) OVERRIDE; 151 virtual bool FinishChunk() OVERRIDE; 152 153 virtual bool BeginUpdate() OVERRIDE; 154 virtual bool FinishUpdate( 155 safe_browsing::PrefixSetBuilder* builder, 156 std::vector<SBAddFullHash>* add_full_hashes_result) OVERRIDE; 157 virtual bool CancelUpdate() OVERRIDE; 158 159 virtual void SetAddChunk(int32 chunk_id) OVERRIDE; 160 virtual bool CheckAddChunk(int32 chunk_id) OVERRIDE; 161 virtual void GetAddChunks(std::vector<int32>* out) OVERRIDE; 162 
virtual void SetSubChunk(int32 chunk_id) OVERRIDE; 163 virtual bool CheckSubChunk(int32 chunk_id) OVERRIDE; 164 virtual void GetSubChunks(std::vector<int32>* out) OVERRIDE; 165 166 virtual void DeleteAddChunk(int32 chunk_id) OVERRIDE; 167 virtual void DeleteSubChunk(int32 chunk_id) OVERRIDE; 168 169 // Verify |file_|'s checksum, calling the corruption callback if it 170 // does not check out. Empty input is considered valid. 171 virtual bool CheckValidity() OVERRIDE; 172 173 // Returns the name of the temporary file used to buffer data for 174 // |filename|. Exported for unit tests. 175 static const base::FilePath TemporaryFileForFilename( 176 const base::FilePath& filename) { 177 return base::FilePath(filename.value() + FILE_PATH_LITERAL("_new")); 178 } 179 180 // Delete any on-disk files, including the permanent storage. 181 static bool DeleteStore(const base::FilePath& basename); 182 183 private: 184 // Does the actual update for FinishUpdate(), so that FinishUpdate() can clean 185 // up correctly in case of error. 186 virtual bool DoUpdate(safe_browsing::PrefixSetBuilder* builder, 187 std::vector<SBAddFullHash>* add_full_hashes_result); 188 189 // Some very lucky users have an original-format file still in their 190 // profile. Check for it and delete, recording a histogram for the 191 // result (no histogram for not-found). Logically this 192 // would make more sense at the SafeBrowsingDatabase level, but 193 // practically speaking that code doesn't touch files directly. 194 static void CheckForOriginalAndDelete(const base::FilePath& filename); 195 196 // Close all files and clear all buffers. 197 bool Close(); 198 199 // Calls |corruption_callback_| if non-NULL, always returns false as 200 // a convenience to the caller. 201 bool OnCorruptDatabase(); 202 203 // Helper for creating a corruption callback for |old_store_|. 204 // TODO(shess): Remove after migration. 
205 void HandleCorruptDatabase(); 206 207 // Clear temporary buffers used to accumulate chunk data. 208 bool ClearChunkBuffers() { 209 // NOTE: .clear() doesn't release memory. 210 // TODO(shess): Figure out if this is overkill. Some amount of 211 // pre-reserved space is probably reasonable between each chunk 212 // collected. 213 SBAddPrefixes().swap(add_prefixes_); 214 SBSubPrefixes().swap(sub_prefixes_); 215 std::vector<SBAddFullHash>().swap(add_hashes_); 216 std::vector<SBSubFullHash>().swap(sub_hashes_); 217 return true; 218 } 219 220 // Clear all buffers used during update. 221 void ClearUpdateBuffers() { 222 ClearChunkBuffers(); 223 chunks_written_ = 0; 224 std::set<int32>().swap(add_chunks_cache_); 225 std::set<int32>().swap(sub_chunks_cache_); 226 base::hash_set<int32>().swap(add_del_cache_); 227 base::hash_set<int32>().swap(sub_del_cache_); 228 } 229 230 // Buffers for collecting data between BeginChunk() and 231 // FinishChunk(). 232 SBAddPrefixes add_prefixes_; 233 SBSubPrefixes sub_prefixes_; 234 std::vector<SBAddFullHash> add_hashes_; 235 std::vector<SBSubFullHash> sub_hashes_; 236 237 // Count of chunks collected in |new_file_|. 238 int chunks_written_; 239 240 // Name of the main database file. 241 base::FilePath filename_; 242 243 // Handles to the main and scratch files. |empty_| is true if the 244 // main file didn't exist when the update was started. 245 base::ScopedFILE file_; 246 base::ScopedFILE new_file_; 247 bool empty_; 248 249 // Cache of chunks which have been seen. Loaded from the database 250 // on BeginUpdate() so that it can be queried during the 251 // transaction. 252 std::set<int32> add_chunks_cache_; 253 std::set<int32> sub_chunks_cache_; 254 255 // Cache the set of deleted chunks during a transaction, applied on 256 // FinishUpdate(). 257 // TODO(shess): If the set is small enough, hash_set<> might be 258 // slower than plain set<>. 
259 base::hash_set<int32> add_del_cache_; 260 base::hash_set<int32> sub_del_cache_; 261 262 base::Closure corruption_callback_; 263 264 // Tracks whether corruption has already been seen in the current 265 // update, so that only one instance is recorded in the stats. 266 // TODO(shess): Remove with format-migration support. 267 bool corruption_seen_; 268 269 DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile); 270 }; 271 272 #endif // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_ 273