// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/safe_browsing/safe_browsing_store_file.h"

#include <algorithm>

#include "base/files/file_util.h"
#include "base/files/scoped_file.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sparse_histogram.h"

namespace {

// NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
// that byte-order changes force corruption.
const int32 kFileMagic = 0x600D71FE;

// Version history:
// Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
// Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
// Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
const int32 kFileVersion = 8;

// ReadAndVerifyHeader() returns this in case of error.
const int32 kInvalidVersion = -1;

// Starting with version 8, the storage is sorted and can be sharded to allow
// updates to be done with lower memory requirements.  Newly written files will
// be sharded to need less than this amount of memory during update.  Larger
// values are preferred to minimize looping overhead during processing.
const int64 kUpdateStorageBytes = 100 * 1024;

// Prevent excessive sharding by setting a lower limit on the shard stride.
// Smaller values should work fine, but very small values will probably lead to
// poor performance.  Shard stride is indirectly related to
// |kUpdateStorageBytes|; setting that very small will bump against this limit.
const uint32 kMinShardStride = 1 << 24;

// Strides over the entire SBPrefix space.
const uint64 kMaxShardStride = 1ULL << 32;

// Maximum SBPrefix value.
const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;

// Header at the front of the main database file.
struct FileHeader {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  uint32 shard_stride;
  // TODO(shess): Is this where 64-bit will bite me?  Perhaps write a
  // specialized read/write?
};

// Header for each chunk in the chunk-accumulation file.
struct ChunkHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Header for each shard of data in the main database file.
struct ShardHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};
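// For orientation, the on-disk layout of a version-8 store, as written by
// WriteHeader() and DoUpdate() below (a sketch, in file order):
//
//   FileHeader                 magic, version, chunk counts, shard stride
//   int32 * add_chunk_count    sorted add chunk ids
//   int32 * sub_chunk_count    sorted sub chunk ids
//   base::MD5Digest            checksum of everything above
//   repeated shards, covering the SBPrefix space in |shard_stride| steps:
//     ShardHeader              item counts for this shard
//     add prefixes, sub prefixes, add hashes, sub hashes (sorted)
//   base::MD5Digest            checksum of everything above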
// Enumerate different format-change events for histogramming
// purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
enum FormatEventType {
  // Corruption detected, broken down by file format.
  FORMAT_EVENT_FILE_CORRUPT,
  FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete

  // The type of format found in the file.  The expected case (new
  // file format) is intentionally not covered.
  FORMAT_EVENT_FOUND_SQLITE,  // Obsolete
  FORMAT_EVENT_FOUND_UNKNOWN,  // magic does not match.

  // The number of SQLite-format files deleted should be the same as
  // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
  // or if a failure prevents the update from succeeding.
  FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
  FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete

  // Found and deleted (or failed to delete) the ancient "Safe
  // Browsing" file.
  FORMAT_EVENT_DELETED_ORIGINAL,  // Obsolete
  FORMAT_EVENT_DELETED_ORIGINAL_FAILED,  // Obsolete

  // The checksum did not check out in CheckValidity() or in
  // FinishUpdate().  This most likely indicates that the machine
  // crashed before the file was fully sync'ed to disk.
  FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
  FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,

  // The header checksum was incorrect in ReadAndVerifyHeader().  Likely
  // indicates that the system crashed while writing an update.
  FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,

  FORMAT_EVENT_FOUND_DEPRECATED,  // version too old.

  // Memory space for histograms is determined by the max.  ALWAYS
  // ADD NEW VALUES BEFORE THIS ONE.
  FORMAT_EVENT_MAX
};

void RecordFormatEvent(FormatEventType event_type) {
  UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
}

// Rewind the file.  Using fseek(2) because rewind(3) errors are
// weird.
bool FileRewind(FILE* fp) {
  int rv = fseek(fp, 0, SEEK_SET);
  DCHECK_EQ(rv, 0);
  return rv == 0;
}

// Read from |fp| into |item|, and fold the input data into the
// checksum in |context|, if non-NULL.  Return true on success.
template <class T>
bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fread(item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<char*>(item),
                                      sizeof(T)));
  }
  return true;
}

// Write |item| to |fp|, and fold the output data into the checksum in
// |context|, if non-NULL.  Return true on success.
template <class T>
bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fwrite(&item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<const char*>(&item),
                                      sizeof(T)));
  }

  return true;
}

// Read |count| items into |values| from |fp|, and fold them into the
// checksum in |context|, if non-NULL.  Returns true on success.
template <typename CT>
bool ReadToContainer(CT* values, size_t count, FILE* fp,
                     base::MD5Context* context) {
  if (!count)
    return true;

  for (size_t i = 0; i < count; ++i) {
    typename CT::value_type value;
    if (!ReadItem(&value, fp, context))
      return false;

    // push_back() is more obvious, but coded this way std::set can
    // also be read.
    values->insert(values->end(), value);
  }

  return true;
}

// Write values between |beg| and |end| to |fp|, and fold the data into the
// checksum in |context|, if non-NULL.  Returns true if all items were
// written.
template <typename CTI>
bool WriteRange(const CTI& beg, const CTI& end,
                FILE* fp, base::MD5Context* context) {
  for (CTI iter = beg; iter != end; ++iter) {
    if (!WriteItem(*iter, fp, context))
      return false;
  }
  return true;
}

// Write all of |values| to |fp|, and fold the data into the checksum
// in |context|, if non-NULL.  Returns true if all items were written.
template <typename CT>
bool WriteContainer(const CT& values, FILE* fp,
                    base::MD5Context* context) {
  return WriteRange(values.begin(), values.end(), fp, context);
}
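// Illustrative composition of the helpers above (a sketch mirroring
// WriteHeader(), below): writes fold their bytes into the running checksum,
// and a digest emitted mid-stream is itself folded in, so that later digests
// cover it too.
//
//   base::MD5Context context;
//   base::MD5Init(&context);
//   if (!WriteItem(header, fp, &context) ||
//       !WriteContainer(add_chunks, fp, &context))
//     return false;  // Short write.
//   base::MD5Digest digest;
//   base::MD5IntermediateFinal(&digest, &context);
//   if (!WriteItem(digest, fp, &context))  // Checksum of everything so far.
//     return false;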
// Delete the chunks in |deleted| from |chunks|.
void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
                         std::set<int32>* chunks) {
  for (std::set<int32>::iterator iter = chunks->begin();
       iter != chunks->end();) {
    std::set<int32>::iterator prev = iter++;
    if (deleted.count(*prev) > 0)
      chunks->erase(prev);
  }
}

// Compare the checksum of the data read so far against the digest stored at
// the current file position.  The stored digest is deliberately folded into
// |context| as well, so a running checksum over the rest of the file also
// covers the digest bytes.
bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
  base::MD5Digest calculated_digest;
  base::MD5IntermediateFinal(&calculated_digest, context);

  base::MD5Digest file_digest;
  if (!ReadItem(&file_digest, fp, context))
    return false;

  return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
}

// Helper function to read the file header and chunk TOC.  Rewinds |fp| and
// initializes |context|.  The header is left in |header|, and the version is
// returned.  Returns kInvalidVersion on sanity-check or checksum failure.
int ReadAndVerifyHeader(const base::FilePath& filename,
                        FileHeader* header,
                        std::set<int32>* add_chunks,
                        std::set<int32>* sub_chunks,
                        FILE* fp,
                        base::MD5Context* context) {
  DCHECK(header);
  DCHECK(add_chunks);
  DCHECK(sub_chunks);
  DCHECK(fp);
  DCHECK(context);

  base::MD5Init(context);
  if (!FileRewind(fp))
    return kInvalidVersion;
  if (!ReadItem(header, fp, context))
    return kInvalidVersion;
  if (header->magic != kFileMagic)
    return kInvalidVersion;

  // Track version read to inform removal of support for older versions.
  UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);

  if (header->version != kFileVersion)
    return kInvalidVersion;

  if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
      !ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
    return kInvalidVersion;
  }

  // Verify that the data read thus far is valid.
  if (!ReadAndVerifyChecksum(fp, context)) {
    RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
    return kInvalidVersion;
  }

  return kFileVersion;
}

// Helper function to write out the file header and chunk TOC.  Rewinds |fp|,
// initializes |context|, then writes the file header, |add_chunks|, and
// |sub_chunks|, followed by the header digest.
bool WriteHeader(uint32 out_stride,
                 const std::set<int32>& add_chunks,
                 const std::set<int32>& sub_chunks,
                 FILE* fp,
                 base::MD5Context* context) {
  if (!FileRewind(fp))
    return false;

  base::MD5Init(context);
  FileHeader header;
  header.magic = kFileMagic;
  header.version = kFileVersion;
  header.add_chunk_count = add_chunks.size();
  header.sub_chunk_count = sub_chunks.size();
  header.shard_stride = out_stride;
  if (!WriteItem(header, fp, context))
    return false;

  if (!WriteContainer(add_chunks, fp, context) ||
      !WriteContainer(sub_chunks, fp, context))
    return false;

  // Write out the header digest.
  base::MD5Digest header_digest;
  base::MD5IntermediateFinal(&header_digest, context);
  if (!WriteItem(header_digest, fp, context))
    return false;

  return true;
}

// Return |true| if the range is sorted by the given comparator.
template <typename CTI, typename LESS>
bool sorted(CTI beg, CTI end, LESS less) {
  while ((end - beg) > 1) {
    CTI n = beg++;
    DCHECK(!less(*beg, *n));
    if (less(*beg, *n))
      return false;
  }
  return true;
}
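// Illustrative trace of container_merge() below, merging the range [2, 4]
// into a container holding [1, 3, 5] (hypothetical values):
//   [1, 3, 5, _, _]  resize to make room, then scan from the back
//   [1, 3, 5, _, 5]  5 vs. 4: container element is greater, move 5
//   [1, 3, 5, 4, 5]  3 vs. 4: range element is greater, move 4
//   [1, 3, 3, 4, 5]  3 vs. 2: container element is greater, move 3
//   [1, 2, 3, 4, 5]  1 vs. 2: range element is greater, move 2
// The backward scan is what lets the merge run in place without the scratch
// buffer std::inplace_merge() may allocate.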
// Merge |beg|..|end| into |container|.  Both should be sorted by the given
// comparator, and the range iterators should not be derived from |container|.
// Differs from std::inplace_merge() in that additional memory is not required
// for linear performance.
template <typename CT, typename CTI, typename COMP>
void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
  DCHECK(sorted(container->begin(), container->end(), less));
  DCHECK(sorted(beg, end, less));

  // Size the container to fit the results.
  const size_t c_size = container->size();
  container->resize(c_size + (end - beg));

  // |c_end| points to the original endpoint, while |c_out| points to the
  // endpoint that will scan from end to beginning while merging.
  typename CT::iterator c_end = container->begin() + c_size;
  typename CT::iterator c_out = container->end();

  // While both inputs have data, move the greater to |c_out|.
  while (c_end != container->begin() && end != beg) {
    if (less(*(c_end - 1), *(end - 1))) {
      *(--c_out) = *(--end);
    } else {
      *(--c_out) = *(--c_end);
    }
  }

  // Copy any data remaining in the new range.
  if (end != beg) {
    // The original container data has been fully shifted.
    DCHECK(c_end == container->begin());

    // There is exactly the correct amount of space left.
    DCHECK_EQ(c_out - c_end, end - beg);

    std::copy(beg, end, container->begin());
  }

  DCHECK(sorted(container->begin(), container->end(), less));
}

// Collection of iterators used while stepping through StateInternal (see
// below).
class StateInternalPos {
 public:
  StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
                   SBSubPrefixes::iterator sub_prefixes_iter,
                   std::vector<SBAddFullHash>::iterator add_hashes_iter,
                   std::vector<SBSubFullHash>::iterator sub_hashes_iter)
      : add_prefixes_iter_(add_prefixes_iter),
        sub_prefixes_iter_(sub_prefixes_iter),
        add_hashes_iter_(add_hashes_iter),
        sub_hashes_iter_(sub_hashes_iter) {
  }

  SBAddPrefixes::iterator add_prefixes_iter_;
  SBSubPrefixes::iterator sub_prefixes_iter_;
  std::vector<SBAddFullHash>::iterator add_hashes_iter_;
  std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
};

// Helper to find the next shard boundary.
template <class T>
bool prefix_bounder(SBPrefix val, const T& elt) {
  return val < elt.GetAddPrefix();
}

// Container for partial database state.  Includes add/sub prefixes/hashes,
// plus aggregate operations on same.
class StateInternal {
 public:
  // Append indicated amount of data from |fp|.
  bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
                  size_t add_hash_count, size_t sub_hash_count,
                  FILE* fp, base::MD5Context* context) {
    return
        ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
        ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
        ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
        ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
  }

  void ClearData() {
    add_prefixes_.clear();
    sub_prefixes_.clear();
    add_full_hashes_.clear();
    sub_full_hashes_.clear();
  }
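  // Typical flow during an update (see DoUpdate(), below): AppendData() the
  // raw chunk data, SortData() once, then step through the prefix space using
  // ShardEnd() to bound each shard, MergeDataAndProcess() to fold updates
  // into the database state, and WriteShard() to emit the result.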
  // Merge data from |beg|..|end| into the receiver's state, then process the
  // state.  The current state and the range given should correspond to the
  // same sorted shard of data from different sources.  |add_del_cache| and
  // |sub_del_cache| indicate the chunk ids which should be deleted during
  // processing (see SBProcessSubs).
  void MergeDataAndProcess(const StateInternalPos& beg,
                           const StateInternalPos& end,
                           const base::hash_set<int32>& add_del_cache,
                           const base::hash_set<int32>& sub_del_cache) {
    container_merge(&add_prefixes_,
                    beg.add_prefixes_iter_,
                    end.add_prefixes_iter_,
                    SBAddPrefixLess<SBAddPrefix, SBAddPrefix>);

    container_merge(&sub_prefixes_,
                    beg.sub_prefixes_iter_,
                    end.sub_prefixes_iter_,
                    SBAddPrefixLess<SBSubPrefix, SBSubPrefix>);

    container_merge(&add_full_hashes_,
                    beg.add_hashes_iter_,
                    end.add_hashes_iter_,
                    SBAddPrefixHashLess<SBAddFullHash, SBAddFullHash>);

    container_merge(&sub_full_hashes_,
                    beg.sub_hashes_iter_,
                    end.sub_hashes_iter_,
                    SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);

    SBProcessSubs(&add_prefixes_, &sub_prefixes_,
                  &add_full_hashes_, &sub_full_hashes_,
                  add_del_cache, sub_del_cache);
  }

  // Sort the data appropriately for the sharding, merging, and processing
  // operations.
  void SortData() {
    std::sort(add_prefixes_.begin(), add_prefixes_.end(),
              SBAddPrefixLess<SBAddPrefix, SBAddPrefix>);
    std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
              SBAddPrefixLess<SBSubPrefix, SBSubPrefix>);
    std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
              SBAddPrefixHashLess<SBAddFullHash, SBAddFullHash>);
    std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
              SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
  }

  // Iterator from the beginning of the state's data.
  StateInternalPos StateBegin() {
    return StateInternalPos(add_prefixes_.begin(),
                            sub_prefixes_.begin(),
                            add_full_hashes_.begin(),
                            sub_full_hashes_.begin());
  }

  // An iterator pointing just after the last possible element of the shard
  // indicated by |shard_max|.  Used to step through the state by shard.
  // TODO(shess): Verify whether binary search really improves over linear.
  // Merging or writing will immediately touch all of these elements.
  StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
    return StateInternalPos(
        std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
                         shard_max, prefix_bounder<SBAddPrefix>),
        std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
                         shard_max, prefix_bounder<SBSubPrefix>),
        std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
                         shard_max, prefix_bounder<SBAddFullHash>),
        std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
                         shard_max, prefix_bounder<SBSubFullHash>));
  }
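  // E.g., stepping the state by shard (illustrative; |stride| is a
  // hypothetical power-of-two shard width):
  //   StateInternalPos pos = StateBegin();
  //   for (uint64 min = 0; min <= kMaxSBPrefix; min += stride) {
  //     StateInternalPos end =
  //         ShardEnd(pos, static_cast<SBPrefix>(min + stride - 1));
  //     // ... merge or write [pos, end) ...
  //     pos = end;
  //   }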
  // Write a shard header and data for the shard starting at |beg| and ending
  // at the element before |end|.
  bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
                  FILE* fp, base::MD5Context* context) {
    ShardHeader shard_header;
    shard_header.add_prefix_count =
        end.add_prefixes_iter_ - beg.add_prefixes_iter_;
    shard_header.sub_prefix_count =
        end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
    shard_header.add_hash_count =
        end.add_hashes_iter_ - beg.add_hashes_iter_;
    shard_header.sub_hash_count =
        end.sub_hashes_iter_ - beg.sub_hashes_iter_;

    return
        WriteItem(shard_header, fp, context) &&
        WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
                   fp, context);
  }

  SBAddPrefixes add_prefixes_;
  SBSubPrefixes sub_prefixes_;
  std::vector<SBAddFullHash> add_full_hashes_;
  std::vector<SBSubFullHash> sub_full_hashes_;
};

// True if |val| is a (non-zero) power of two.
template <typename T>
bool IsPowerOfTwo(const T& val) {
  return val && (val & (val - 1)) == 0;
}

// Helper to read the entire database state, used by GetAddPrefixes() and
// GetAddFullHashes().  Those functions are generally used only for smaller
// files.  Returns false in case of errors reading the data.
bool ReadDbStateHelper(const base::FilePath& filename,
                       StateInternal* db_state) {
  base::ScopedFILE file(base::OpenFile(filename, "rb"));
  if (file.get() == NULL)
    return false;

  std::set<int32> add_chunks;
  std::set<int32> sub_chunks;

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
                          file.get(), &context);
  if (version == kInvalidVersion)
    return false;

  uint64 in_min = 0;
  uint64 in_stride = header.shard_stride;
  if (!in_stride)
    in_stride = kMaxShardStride;
  if (!IsPowerOfTwo(in_stride))
    return false;

  do {
    ShardHeader shard_header;
    if (!ReadItem(&shard_header, file.get(), &context))
      return false;

    if (!db_state->AppendData(shard_header.add_prefix_count,
                              shard_header.sub_prefix_count,
                              shard_header.add_hash_count,
                              shard_header.sub_hash_count,
                              file.get(), &context)) {
      return false;
    }

    in_min += in_stride;
  } while (in_min <= kMaxSBPrefix);

  if (!ReadAndVerifyChecksum(file.get(), &context))
    return false;

  // The overall checksum should be the last thing in the file; anything
  // trailing it indicates corruption.
  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;

  return static_cast<int64>(ftell(file.get())) == size;
}

}  // namespace
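// Illustrative client sequence for an update (simplified sketch; error
// handling omitted, and |path|, |corruption_cb|, |builder|, and
// |add_full_hashes| are hypothetical):
//
//   SafeBrowsingStoreFile store;
//   store.Init(path, corruption_cb);
//   store.BeginUpdate();
//   store.BeginChunk();
//   store.SetAddChunk(chunk_id);
//   store.WriteAddPrefix(chunk_id, prefix);
//   store.FinishChunk();
//   store.FinishUpdate(&builder, &add_full_hashes);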
SafeBrowsingStoreFile::SafeBrowsingStoreFile()
    : chunks_written_(0), empty_(false), corruption_seen_(false) {}

SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
  Close();
}

bool SafeBrowsingStoreFile::Delete() {
  // The database should not be open at this point.  But, just in
  // case, close everything before deleting.
  if (!Close()) {
    NOTREACHED();
    return false;
  }

  return DeleteStore(filename_);
}

bool SafeBrowsingStoreFile::CheckValidity() {
  // The file was either empty or never opened.  The empty case is
  // presumed not to be invalid.  The never-opened case can happen if
  // BeginUpdate() fails for any database, and should already have
  // caused the corruption callback to fire.
  if (!file_.get())
    return true;

  if (!FileRewind(file_.get()))
    return OnCorruptDatabase();

  int64 size = 0;
  if (!base::GetFileSize(filename_, &size))
    return OnCorruptDatabase();

  base::MD5Context context;
  base::MD5Init(&context);

  // Read everything except the final digest.
  size_t bytes_left = static_cast<size_t>(size);
  CHECK(size == static_cast<int64>(bytes_left));
  if (bytes_left < sizeof(base::MD5Digest))
    return OnCorruptDatabase();
  bytes_left -= sizeof(base::MD5Digest);

  // Fold the contents of the file into the checksum.
  while (bytes_left > 0) {
    char buf[4096];
    const size_t c = std::min(sizeof(buf), bytes_left);
    const size_t ret = fread(buf, 1, c, file_.get());

    // The file's size changed while reading; give up.
    if (ret != c)
      return OnCorruptDatabase();
    base::MD5Update(&context, base::StringPiece(buf, c));
    bytes_left -= c;
  }

  if (!ReadAndVerifyChecksum(file_.get(), &context)) {
    RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
    return OnCorruptDatabase();
  }

  return true;
}

void SafeBrowsingStoreFile::Init(const base::FilePath& filename,
                                 const base::Closure& corruption_callback) {
  filename_ = filename;
  corruption_callback_ = corruption_callback;
}

bool SafeBrowsingStoreFile::BeginChunk() {
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
  add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
  add_prefixes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_prefixes->swap(db_state.add_prefixes_);
  return true;
}

bool SafeBrowsingStoreFile::GetAddFullHashes(
    std::vector<SBAddFullHash>* add_full_hashes) {
  add_full_hashes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_full_hashes->swap(db_state.add_full_hashes_);
  return true;
}

bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
                                         const SBFullHash& full_hash) {
  add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
                                           int32 add_chunk_id,
                                           SBPrefix prefix) {
  sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
                                         const SBFullHash& full_hash) {
  sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::OnCorruptDatabase() {
  if (!corruption_seen_)
    RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
  corruption_seen_ = true;

  corruption_callback_.Run();

  // Return false as a convenience to callers.
  return false;
}

bool SafeBrowsingStoreFile::Close() {
  ClearUpdateBuffers();

  // Make sure the files are closed.
  file_.reset();
  new_file_.reset();
  return true;
}
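// Updates are accumulated in a temporary file (TemporaryFileForFilename())
// alongside the database: BeginUpdate() opens it and FinishChunk() appends
// raw chunk records, then DoUpdate() rewrites it in place as a sorted,
// sharded store and renames it over the original.  A failed update is
// cleaned up by CancelUpdate().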
bool SafeBrowsingStoreFile::BeginUpdate() {
  DCHECK(!file_.get() && !new_file_.get());

  // Structures should all be clear unless something bad happened.
  DCHECK(add_chunks_cache_.empty());
  DCHECK(sub_chunks_cache_.empty());
  DCHECK(add_del_cache_.empty());
  DCHECK(sub_del_cache_.empty());
  DCHECK(add_prefixes_.empty());
  DCHECK(sub_prefixes_.empty());
  DCHECK(add_hashes_.empty());
  DCHECK(sub_hashes_.empty());
  DCHECK_EQ(chunks_written_, 0);

  corruption_seen_ = false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
  if (new_file.get() == NULL)
    return false;

  base::ScopedFILE file(base::OpenFile(filename_, "rb"));
  empty_ = (file.get() == NULL);
  if (empty_) {
    // If the file exists but cannot be opened, try to delete it (not
    // deleting directly, the bloom filter needs to be deleted, too).
    if (base::PathExists(filename_))
      return OnCorruptDatabase();

    new_file_.swap(new_file);
    return true;
  }

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename_, &header,
                          &add_chunks_cache_, &sub_chunks_cache_,
                          file.get(), &context);
  if (version == kInvalidVersion) {
    FileHeader retry_header;
    if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
      if (retry_header.magic == kFileMagic &&
          retry_header.version < kFileVersion) {
        RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
      } else {
        RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
      }
    }

    // Close the file so that it can be deleted.
    file.reset();

    return OnCorruptDatabase();
  }

  file_.swap(file);
  new_file_.swap(new_file);
  return true;
}

bool SafeBrowsingStoreFile::FinishChunk() {
  if (!add_prefixes_.size() && !sub_prefixes_.size() &&
      !add_hashes_.size() && !sub_hashes_.size())
    return true;

  ChunkHeader header;
  header.add_prefix_count = add_prefixes_.size();
  header.sub_prefix_count = sub_prefixes_.size();
  header.add_hash_count = add_hashes_.size();
  header.sub_hash_count = sub_hashes_.size();
  if (!WriteItem(header, new_file_.get(), NULL))
    return false;

  if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_hashes_, new_file_.get(), NULL))
    return false;

  ++chunks_written_;

  // Clear everything to save memory.
  return ClearChunkBuffers();
}
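// Each chunk record written by FinishChunk() above is a ChunkHeader followed
// by its four data arrays, unsorted and without checksums; DoUpdate() below
// replays those records into memory before rewriting the file in sorted,
// sharded form.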
bool SafeBrowsingStoreFile::DoUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(file_.get() || empty_);
  DCHECK(new_file_.get());
  CHECK(builder);
  CHECK(add_full_hashes_result);

  // Rewind the temporary storage.
  if (!FileRewind(new_file_.get()))
    return false;

  // Get the chunk file's size for validating counts.
  int64 update_size = 0;
  if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
    return OnCorruptDatabase();

  // Track update size to answer questions at http://crbug.com/72216 .
  // Log small updates as 1k so that the 0 (underflow) bucket can be
  // used for "empty" in SafeBrowsingDatabase.
  UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
                       std::max(static_cast<int>(update_size / 1024), 1));

  // Chunk updates to integrate.
  StateInternal new_state;

  // Read update chunks.
  for (int i = 0; i < chunks_written_; ++i) {
    ChunkHeader header;

    int64 ofs = ftell(new_file_.get());
    if (ofs == -1)
      return false;

    if (!ReadItem(&header, new_file_.get(), NULL))
      return false;

    // As a safety measure, make sure that the header describes a sane
    // chunk, given the remaining file size.
    int64 expected_size = ofs + sizeof(ChunkHeader);
    expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
    expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
    expected_size += header.add_hash_count * sizeof(SBAddFullHash);
    expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
    if (expected_size > update_size)
      return false;

    if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
                              header.add_hash_count, header.sub_hash_count,
                              new_file_.get(), NULL)) {
      return false;
    }
  }

  // The state was accumulated by chunk; sort it by prefix.
  new_state.SortData();

  // These strides control how much data is loaded into memory per pass.
  // Strides must be a power of two.  |in_stride| will be derived from the
  // input file.  |out_stride| will be derived from an estimate of the
  // resulting file's size.  |process_stride| will be the max of both.
  uint64 in_stride = kMaxShardStride;
  uint64 out_stride = kMaxShardStride;
  uint64 process_stride = 0;

  // Used to verify the input's checksum if |!empty_|.
  base::MD5Context in_context;

  if (!empty_) {
    DCHECK(file_.get());

    FileHeader header = {0};
    int version = ReadAndVerifyHeader(filename_, &header,
                                      &add_chunks_cache_, &sub_chunks_cache_,
                                      file_.get(), &in_context);
    if (version == kInvalidVersion)
      return OnCorruptDatabase();

    if (header.shard_stride)
      in_stride = header.shard_stride;

    // The header checksum should have caught a bad stride, but the loops
    // below will misbehave if it is not a power of two.
    if (!IsPowerOfTwo(in_stride))
      return OnCorruptDatabase();
  }

  // We no longer need to track deleted chunks.
  DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
  DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);

  // Calculate |out_stride| to break the file down into reasonable shards.
  {
    int64 original_size = 0;
    if (!empty_ && !base::GetFileSize(filename_, &original_size))
      return OnCorruptDatabase();

    // Approximate the final size as everything.  Subs and deletes will reduce
    // the size, but modest over-sharding won't hurt much.
    int64 shard_size = original_size + update_size;

    // Keep splitting until a single stride of data fits the target.
    size_t shifts = 0;
    while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
      out_stride >>= 1;
      shard_size >>= 1;
      ++shifts;
    }
    UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);

    DCHECK(IsPowerOfTwo(out_stride));
  }
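  // E.g. (illustrative): for a 4 MB database plus update, the loop above
  // halves six times to get under kUpdateStorageBytes (100 KB), leaving
  // out_stride = 2^32 >> 6 = 2^26, which means 64 output shards.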
  // The outer loop strides by the max of the input stride (to read integral
  // shards) and the output stride (to write integral shards).
  process_stride = std::max(in_stride, out_stride);
  DCHECK(IsPowerOfTwo(process_stride));
  DCHECK_EQ(0u, process_stride % in_stride);
  DCHECK_EQ(0u, process_stride % out_stride);

  // Start writing the new data to |new_file_|.
  base::MD5Context out_context;
  if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
                   new_file_.get(), &out_context)) {
    return false;
  }

  // Start at the beginning of the SBPrefix space.
  uint64 in_min = 0;
  uint64 out_min = 0;
  uint64 process_min = 0;

  // Start at the beginning of the updates.
  StateInternalPos new_pos = new_state.StateBegin();

  // Re-usable container for shard processing.
  StateInternal db_state;

  // Track aggregate counts for histograms.
  size_t add_prefix_count = 0;
  size_t sub_prefix_count = 0;

  do {
    // Maximum element in the current shard.
    SBPrefix process_max =
        static_cast<SBPrefix>(process_min + process_stride - 1);
    DCHECK_GT(process_max, process_min);

    // Drop the data from the previous pass.
    db_state.ClearData();

    // Fill the processing shard with one or more input shards.
    if (!empty_) {
      do {
        ShardHeader shard_header;
        if (!ReadItem(&shard_header, file_.get(), &in_context))
          return OnCorruptDatabase();

        if (!db_state.AppendData(shard_header.add_prefix_count,
                                 shard_header.sub_prefix_count,
                                 shard_header.add_hash_count,
                                 shard_header.sub_hash_count,
                                 file_.get(), &in_context))
          return OnCorruptDatabase();

        in_min += in_stride;
      } while (in_min <= kMaxSBPrefix && in_min < process_max);
    }

    // Shard the update data to match the database data, then merge the update
    // data and process the results.
    {
      StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
      db_state.MergeDataAndProcess(new_pos, new_end,
                                   add_del_cache_, sub_del_cache_);
      new_pos = new_end;
    }

    // Collect the processed data for return to the caller.
    for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
      builder->AddPrefix(db_state.add_prefixes_[i].prefix);
    }
    add_full_hashes_result->insert(add_full_hashes_result->end(),
                                   db_state.add_full_hashes_.begin(),
                                   db_state.add_full_hashes_.end());
    add_prefix_count += db_state.add_prefixes_.size();
    sub_prefix_count += db_state.sub_prefixes_.size();

    // Write one or more shards of processed output.
    StateInternalPos out_pos = db_state.StateBegin();
    do {
      SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
      DCHECK_GT(out_max, out_min);

      StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
      if (!db_state.WriteShard(out_pos, out_end, new_file_.get(),
                               &out_context))
        return false;
      out_pos = out_end;

      out_min += out_stride;
    } while (out_min == static_cast<SBPrefix>(out_min) &&
             out_min < process_max);

    process_min += process_stride;
  } while (process_min <= kMaxSBPrefix);

  // Verify the overall checksum.
  if (!empty_) {
    if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
      RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
      return OnCorruptDatabase();
    }

    // TODO(shess): Verify EOF?

    // Close the input file so the new file can be renamed over it.
    file_.reset();
  }
  DCHECK(!file_.get());
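  // Note the asymmetry with WriteHeader(): the header digest was emitted via
  // MD5IntermediateFinal() so that |out_context| kept accumulating over the
  // shard data, while the overall digest below finalizes the context for
  // good.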
  // Write the overall checksum.
  base::MD5Digest out_digest;
  base::MD5Final(&out_digest, &out_context);
  if (!WriteItem(out_digest, new_file_.get(), NULL))
    return false;

  // Trim any excess left over from the temporary chunk data.
  if (!base::TruncateFile(new_file_.get()))
    return false;

  // Close the file handle and swizzle the file into place.
  new_file_.reset();
  if (!base::DeleteFile(filename_, false) &&
      base::PathExists(filename_))
    return false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  if (!base::Move(new_filename, filename_))
    return false;

  // Record counts before returning to the caller.
  UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
  UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);

  return true;
}

bool SafeBrowsingStoreFile::FinishUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(builder);
  DCHECK(add_full_hashes_result);

  if (!DoUpdate(builder, add_full_hashes_result)) {
    CancelUpdate();
    return false;
  }

  DCHECK(!new_file_.get());
  DCHECK(!file_.get());

  return Close();
}

bool SafeBrowsingStoreFile::CancelUpdate() {
  bool ret = Close();

  // Delete the stale staging file.
  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::DeleteFile(new_filename, false);

  return ret;
}

void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
  add_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
  return add_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
}

void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
  sub_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
  return sub_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
}

void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
  add_del_cache_.insert(chunk_id);
}

void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
  sub_del_cache_.insert(chunk_id);
}

// static
bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
  if (!base::DeleteFile(basename, false) &&
      base::PathExists(basename)) {
    NOTREACHED();
    return false;
  }

  const base::FilePath new_filename = TemporaryFileForFilename(basename);
  if (!base::DeleteFile(new_filename, false) &&
      base::PathExists(new_filename)) {
    NOTREACHED();
    return false;
  }

  // With SQLite support gone, one way to get to this code is if the
  // existing file is a SQLite file.  Make sure the journal file is
  // also removed.
  const base::FilePath journal_filename(
      basename.value() + FILE_PATH_LITERAL("-journal"));
  if (base::PathExists(journal_filename))
    base::DeleteFile(journal_filename, false);

  return true;
}