// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/safe_browsing/safe_browsing_store_file.h"

#include "base/file_util.h"
#include "base/files/scoped_file.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sparse_histogram.h"

namespace {

// NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
// that byte-order changes force corruption.
const int32 kFileMagic = 0x600D71FE;

// Version history:
// Version 6: aad08754/r2814 by erikkay (at) google.com on 2008-10-02 (sqlite)
// Version 7: 6afe28a5/r37435 by shess (at) chromium.org on 2010-01-28
// Version 8: d3dd0715/r259791 by shess (at) chromium.org on 2014-03-27
const int32 kFileVersion = 8;

// ReadAndVerifyHeader() returns this in case of error.
const int32 kInvalidVersion = -1;

// Header at the front of the main database file, through version 7.
struct FileHeaderV7 {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Starting with version 8, the storage is sorted and can be sharded to allow
// updates to be done with lower memory requirements. Newly written files will
// be sharded to need less than this amount of memory during update. Larger
// values are preferred to minimize looping overhead during processing.
const int64 kUpdateStorageBytes = 100 * 1024;

// Prevent excessive sharding by setting a lower limit on the shard stride.
// Smaller values should work fine, but very small values will probably lead to
// poor performance. Shard stride is indirectly related to
// |kUpdateStorageBytes|; setting that very small will bump against this.
const uint32 kMinShardStride = 1 << 24;

// Strides over the entire SBPrefix space.
const uint64 kMaxShardStride = 1ULL << 32;

// Maximum SBPrefix value.
const SBPrefix kMaxSBPrefix = ~0;

// Header at the front of the main database file, version 8 and later.
struct FileHeaderV8 {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  uint32 shard_stride;
  // TODO(shess): Is this where 64-bit will bite me? Perhaps write a
  // specialized read/write?
};

union FileHeader {
  struct FileHeaderV7 v7;
  struct FileHeaderV8 v8;
};

// Header for each chunk in the chunk-accumulation file.
struct ChunkHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Header for each shard of data in the main database file.
struct ShardHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};
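
// On-disk layout of a version 8 store, as written by WriteHeader(),
// WriteShard(), and DoUpdate() below (structs are stored in their in-memory
// representation, so the format is host-specific):
//   FileHeaderV8
//   add chunk ids (int32 * add_chunk_count)
//   sub chunk ids (int32 * sub_chunk_count)
//   MD5 digest of everything above (the header checksum)
//   one or more shards, each:
//     ShardHeader
//     add prefixes, sub prefixes, add full hashes, sub full hashes
//   MD5 digest of everything above (the whole-file checksum)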

// Enumerate different format-change events for histogramming
// purposes. DO NOT CHANGE THE ORDERING OF THESE VALUES.
enum FormatEventType {
  // Corruption detected, broken down by file format.
  FORMAT_EVENT_FILE_CORRUPT,
  FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete

  // The type of format found in the file. The expected case (new
  // file format) is intentionally not covered.
  FORMAT_EVENT_FOUND_SQLITE,
  FORMAT_EVENT_FOUND_UNKNOWN,

  // The number of SQLite-format files deleted should be the same as
  // FORMAT_EVENT_FOUND_SQLITE. It can differ if the delete fails,
  // or if a failure prevents the update from succeeding.
  FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
  FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete

  // Found and deleted (or failed to delete) the ancient "Safe
  // Browsing" file.
  FORMAT_EVENT_DELETED_ORIGINAL,
  FORMAT_EVENT_DELETED_ORIGINAL_FAILED,

  // The checksum did not check out in CheckValidity() or in
  // FinishUpdate(). This most likely indicates that the machine
  // crashed before the file was fully sync'ed to disk.
  FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
  FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,

  // The header checksum was incorrect in ReadAndVerifyHeader(). Likely
  // indicates that the system crashed while writing an update.
  FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,

  // Memory space for histograms is determined by the max. ALWAYS
  // ADD NEW VALUES BEFORE THIS ONE.
  FORMAT_EVENT_MAX
};

void RecordFormatEvent(FormatEventType event_type) {
  UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
}

// Rewind the file. Using fseek(2) because rewind(3) errors are
// weird.
bool FileRewind(FILE* fp) {
  int rv = fseek(fp, 0, SEEK_SET);
  DCHECK_EQ(rv, 0);
  return rv == 0;
}

// Read from |fp| into |item|, and fold the input data into the
// checksum in |context|, if non-NULL. Return true on success.
template <class T>
bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fread(item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<char*>(item),
                                      sizeof(T)));
  }
  return true;
}

// Write |item| to |fp|, and fold the output data into the checksum in
// |context|, if non-NULL. Return true on success.
template <class T>
bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fwrite(&item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<const char*>(&item),
                                      sizeof(T)));
  }

  return true;
}
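
// NOTE: ReadItem() and WriteItem() move raw struct bytes, so the on-disk
// format depends on the compiler's struct layout and the host's byte order.
// kFileMagic is intentionally not a byte-wise palindrome so that a file read
// with the opposite byte order fails validation instead of being misread.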

// Read |count| items into |values| from |fp|, and fold them into the
// checksum in |context|. Returns true on success.
template <typename CT>
bool ReadToContainer(CT* values, size_t count, FILE* fp,
                     base::MD5Context* context) {
  if (!count)
    return true;

  for (size_t i = 0; i < count; ++i) {
    typename CT::value_type value;
    if (!ReadItem(&value, fp, context))
      return false;

    // push_back() is more obvious, but coded this way a std::set can
    // also be read.
    values->insert(values->end(), value);
  }

  return true;
}

// Write values between |beg| and |end| to |fp|, and fold the data into the
// checksum in |context|, if non-NULL. Returns true if all items were written
// successfully.
template <typename CTI>
bool WriteRange(const CTI& beg, const CTI& end,
                FILE* fp, base::MD5Context* context) {
  for (CTI iter = beg; iter != end; ++iter) {
    if (!WriteItem(*iter, fp, context))
      return false;
  }
  return true;
}

// Write all of |values| to |fp|, and fold the data into the checksum
// in |context|, if non-NULL. Returns true if all items were written
// successfully.
template <typename CT>
bool WriteContainer(const CT& values, FILE* fp,
                    base::MD5Context* context) {
  return WriteRange(values.begin(), values.end(), fp, context);
}

// Delete the chunks in |deleted| from |chunks|.
void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
                         std::set<int32>* chunks) {
  for (std::set<int32>::iterator iter = chunks->begin();
       iter != chunks->end();) {
    std::set<int32>::iterator prev = iter++;
    if (deleted.count(*prev) > 0)
      chunks->erase(prev);
  }
}

bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
  base::MD5Digest calculated_digest;
  base::MD5IntermediateFinal(&calculated_digest, context);

  base::MD5Digest file_digest;
  if (!ReadItem(&file_digest, fp, context))
    return false;

  return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
}

// Sanity-check the header against the file's size to make sure our
// vectors aren't gigantic. This doubles as a cheap way to detect
// corruption without having to checksum the entire file.
bool FileHeaderV7SanityCheck(const base::FilePath& filename,
                             const FileHeaderV7& header) {
  DCHECK_EQ(header.version, 7);

  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;

  int64 expected_size = sizeof(FileHeaderV7);
  expected_size += header.add_chunk_count * sizeof(int32);
  expected_size += header.sub_chunk_count * sizeof(int32);
  expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
  expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
  expected_size += header.add_hash_count * sizeof(SBAddFullHash);
  expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
  expected_size += sizeof(base::MD5Digest);
  if (size != expected_size)
    return false;

  return true;
}

// Helper function to read the file header and chunk TOC. Rewinds |fp| and
// initializes |context|. The header is left in |header|, with the version
// returned. kInvalidVersion is returned on sanity-check or checksum failure.
int ReadAndVerifyHeader(const base::FilePath& filename,
                        FileHeader* header,
                        std::set<int32>* add_chunks,
                        std::set<int32>* sub_chunks,
                        FILE* fp,
                        base::MD5Context* context) {
  DCHECK(header);
  DCHECK(add_chunks);
  DCHECK(sub_chunks);
  DCHECK(fp);
  DCHECK(context);

  int version = kInvalidVersion;

  base::MD5Init(context);
  if (!FileRewind(fp))
    return kInvalidVersion;
  if (!ReadItem(&header->v8, fp, context))
    return kInvalidVersion;
  if (header->v8.magic != kFileMagic)
    return kInvalidVersion;

  size_t add_chunks_count = 0;
  size_t sub_chunks_count = 0;

  // Track version read to inform removal of support for older versions.
  UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->v8.version);

  if (header->v8.version == 7) {
    version = 7;

    // Reset the context and re-read the v7 header.
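    // FileHeaderV7 is larger than FileHeaderV8, and the file's trailing
    // checksum is computed over the bytes as actually stored, so the digest
    // must be restarted and folded over the full v7 header rather than the
    // v8-sized prefix read above.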
    base::MD5Init(context);
    if (!FileRewind(fp))
      return kInvalidVersion;
    if (!ReadItem(&header->v7, fp, context))
      return kInvalidVersion;
    if (header->v7.magic != kFileMagic || header->v7.version != 7)
      return kInvalidVersion;
    if (!FileHeaderV7SanityCheck(filename, header->v7))
      return kInvalidVersion;

    add_chunks_count = header->v7.add_chunk_count;
    sub_chunks_count = header->v7.sub_chunk_count;
  } else if (header->v8.version == kFileVersion) {
    version = 8;
    add_chunks_count = header->v8.add_chunk_count;
    sub_chunks_count = header->v8.sub_chunk_count;
  } else {
    return kInvalidVersion;
  }

  if (!ReadToContainer(add_chunks, add_chunks_count, fp, context) ||
      !ReadToContainer(sub_chunks, sub_chunks_count, fp, context)) {
    return kInvalidVersion;
  }

  // v8 includes a checksum to validate the header.
  if (version > 7 && !ReadAndVerifyChecksum(fp, context)) {
    RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
    return kInvalidVersion;
  }

  return version;
}

// Helper function to write out the file header and the sets of contained
// chunk ids. Rewinds |fp|, initializes |context|, then writes the file
// header, |add_chunks|, and |sub_chunks|.
bool WriteHeader(uint32 out_stride,
                 const std::set<int32>& add_chunks,
                 const std::set<int32>& sub_chunks,
                 FILE* fp,
                 base::MD5Context* context) {
  if (!FileRewind(fp))
    return false;

  base::MD5Init(context);
  FileHeaderV8 header;
  header.magic = kFileMagic;
  header.version = kFileVersion;
  header.add_chunk_count = add_chunks.size();
  header.sub_chunk_count = sub_chunks.size();
  header.shard_stride = out_stride;
  if (!WriteItem(header, fp, context))
    return false;

  if (!WriteContainer(add_chunks, fp, context) ||
      !WriteContainer(sub_chunks, fp, context))
    return false;

  // Write out the header digest.
  base::MD5Digest header_digest;
  base::MD5IntermediateFinal(&header_digest, context);
  if (!WriteItem(header_digest, fp, context))
    return false;

  return true;
}

// Return |true| if the range is sorted by the given comparator.
template <typename CTI, typename LESS>
bool sorted(CTI beg, CTI end, LESS less) {
  while ((end - beg) > 1) {
    CTI n = beg++;
    DCHECK(!less(*beg, *n));
    if (less(*beg, *n))
      return false;
  }
  return true;
}

// Merge |beg|..|end| into |container|. Both should be sorted by the given
// comparator, and the range iterators should not be derived from |container|.
// Differs from std::inplace_merge() in that additional memory is not required
// for linear performance.
template <typename CT, typename CTI, typename COMP>
void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
  DCHECK(sorted(container->begin(), container->end(), less));
  DCHECK(sorted(beg, end, less));

  // Size the container to fit the results.
  const size_t c_size = container->size();
  container->resize(c_size + (end - beg));

  // |c_end| points to the original endpoint, while |c_out| points to the
  // endpoint that will scan from end to beginning while merging.
  typename CT::iterator c_end = container->begin() + c_size;
  typename CT::iterator c_out = container->end();

  // While both inputs have data, move the greater to |c_out|.
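  // Merging from the back toward the front means each original element is
  // read before the output cursor can overwrite it, which is what lets this
  // work without the scratch buffer std::inplace_merge() would need for
  // linear performance.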
  while (c_end != container->begin() && end != beg) {
    if (less(*(c_end - 1), *(end - 1))) {
      *(--c_out) = *(--end);
    } else {
      *(--c_out) = *(--c_end);
    }
  }

  // Copy any data remaining in the new range.
  if (end != beg) {
    // The original container data has been fully shifted.
    DCHECK(c_end == container->begin());

    // There is exactly the correct amount of space left.
    DCHECK_EQ(c_out - c_end, end - beg);

    std::copy(beg, end, container->begin());
  }

  DCHECK(sorted(container->begin(), container->end(), less));
}

// Collection of iterators used while stepping through StateInternal (see
// below).
class StateInternalPos {
 public:
  StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
                   SBSubPrefixes::iterator sub_prefixes_iter,
                   std::vector<SBAddFullHash>::iterator add_hashes_iter,
                   std::vector<SBSubFullHash>::iterator sub_hashes_iter)
      : add_prefixes_iter_(add_prefixes_iter),
        sub_prefixes_iter_(sub_prefixes_iter),
        add_hashes_iter_(add_hashes_iter),
        sub_hashes_iter_(sub_hashes_iter) {
  }

  SBAddPrefixes::iterator add_prefixes_iter_;
  SBSubPrefixes::iterator sub_prefixes_iter_;
  std::vector<SBAddFullHash>::iterator add_hashes_iter_;
  std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
};

// Helper to find the next shard boundary.
template <class T>
bool prefix_bounder(SBPrefix val, const T& elt) {
  return val < elt.GetAddPrefix();
}

// Container for partial database state. Includes add/sub prefixes/hashes, plus
// aggregate operations on same.
class StateInternal {
 public:
  // Append the indicated amount of data from |fp|.
  bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
                  size_t add_hash_count, size_t sub_hash_count,
                  FILE* fp, base::MD5Context* context) {
    return
        ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
        ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
        ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
        ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
  }

  void ClearData() {
    add_prefixes_.clear();
    sub_prefixes_.clear();
    add_full_hashes_.clear();
    sub_full_hashes_.clear();
  }

  // Merge data from |beg|..|end| into the receiver's state, then process the
  // state. The current state and the range given should correspond to the
  // same sorted shard of data from different sources. |add_del_cache| and
  // |sub_del_cache| indicate the chunk ids which should be deleted during
  // processing (see SBProcessSubs).
  void MergeDataAndProcess(const StateInternalPos& beg,
                           const StateInternalPos& end,
                           const base::hash_set<int32>& add_del_cache,
                           const base::hash_set<int32>& sub_del_cache) {
    container_merge(&add_prefixes_,
                    beg.add_prefixes_iter_,
                    end.add_prefixes_iter_,
                    SBAddPrefixLess<SBAddPrefix, SBAddPrefix>);

    container_merge(&sub_prefixes_,
                    beg.sub_prefixes_iter_,
                    end.sub_prefixes_iter_,
                    SBAddPrefixLess<SBSubPrefix, SBSubPrefix>);

    container_merge(&add_full_hashes_,
                    beg.add_hashes_iter_,
                    end.add_hashes_iter_,
                    SBAddPrefixHashLess<SBAddFullHash, SBAddFullHash>);

    container_merge(&sub_full_hashes_,
                    beg.sub_hashes_iter_,
                    end.sub_hashes_iter_,
                    SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);

    SBProcessSubs(&add_prefixes_, &sub_prefixes_,
                  &add_full_hashes_, &sub_full_hashes_,
                  add_del_cache, sub_del_cache);
  }

  // Sort the data appropriately for the sharding, merging, and processing
  // operations.
  void SortData() {
    std::sort(add_prefixes_.begin(), add_prefixes_.end(),
              SBAddPrefixLess<SBAddPrefix, SBAddPrefix>);
    std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
              SBAddPrefixLess<SBSubPrefix, SBSubPrefix>);
    std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
              SBAddPrefixHashLess<SBAddFullHash, SBAddFullHash>);
    std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
              SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
  }

  // Iterator from the beginning of the state's data.
  StateInternalPos StateBegin() {
    return StateInternalPos(add_prefixes_.begin(),
                            sub_prefixes_.begin(),
                            add_full_hashes_.begin(),
                            sub_full_hashes_.begin());
  }

  // An iterator pointing just after the last possible element of the shard
  // indicated by |shard_max|. Used to step through the state by shard.
  // TODO(shess): Verify whether binary search really improves over linear.
  // Merging or writing will immediately touch all of these elements.
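  // For example, with |shard_max| == 0x00FFFFFF, each returned iterator is
  // positioned just past the last element whose add prefix is <= 0x00FFFFFF,
  // so |beg|..ShardEnd(beg, shard_max) spans exactly one shard of the sorted
  // data.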
  StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
    return StateInternalPos(
        std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
                         shard_max, prefix_bounder<SBAddPrefix>),
        std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
                         shard_max, prefix_bounder<SBSubPrefix>),
        std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
                         shard_max, prefix_bounder<SBAddFullHash>),
        std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
                         shard_max, prefix_bounder<SBSubFullHash>));
  }

  // Write a shard header and data for the shard starting at |beg| and ending
  // at the element before |end|.
  bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
                  FILE* fp, base::MD5Context* context) {
    ShardHeader shard_header;
    shard_header.add_prefix_count =
        end.add_prefixes_iter_ - beg.add_prefixes_iter_;
    shard_header.sub_prefix_count =
        end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
    shard_header.add_hash_count =
        end.add_hashes_iter_ - beg.add_hashes_iter_;
    shard_header.sub_hash_count =
        end.sub_hashes_iter_ - beg.sub_hashes_iter_;

    return
        WriteItem(shard_header, fp, context) &&
        WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
                   fp, context);
  }

  SBAddPrefixes add_prefixes_;
  SBSubPrefixes sub_prefixes_;
  std::vector<SBAddFullHash> add_full_hashes_;
  std::vector<SBSubFullHash> sub_full_hashes_;
};

// True if |val| is a power of two.
template <typename T>
bool IsPowerOfTwo(const T& val) {
  return val && (val & (val - 1)) == 0;
}

// Helper to read the entire database state, used by GetAddPrefixes() and
// GetAddFullHashes(). Those functions are generally used only for smaller
// files. Returns false in case of errors reading the data.
bool ReadDbStateHelper(const base::FilePath& filename,
                       StateInternal* db_state) {
  base::ScopedFILE file(base::OpenFile(filename, "rb"));
  if (file.get() == NULL)
    return false;

  std::set<int32> add_chunks;
  std::set<int32> sub_chunks;

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
                          file.get(), &context);
  if (version == kInvalidVersion)
    return false;

  if (version == 7) {
    if (!db_state->AppendData(header.v7.add_prefix_count,
                              header.v7.sub_prefix_count,
                              header.v7.add_hash_count,
                              header.v7.sub_hash_count,
                              file.get(), &context)) {
      return false;
    }

    // v7 data was not stored sorted.
    db_state->SortData();
  } else {
    // Read until the shard start overflows, always at least one pass.
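    // A |shard_stride| of zero in the header denotes a single shard spanning
    // the entire prefix space; otherwise the file holds
    // kMaxShardStride / |in_stride| shards, all of which are read here.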
    uint64 in_min = 0;
    uint64 in_stride = header.v8.shard_stride;
    if (!in_stride)
      in_stride = kMaxShardStride;
    if (!IsPowerOfTwo(in_stride))
      return false;

    do {
      ShardHeader shard_header;
      if (!ReadItem(&shard_header, file.get(), &context))
        return false;

      if (!db_state->AppendData(shard_header.add_prefix_count,
                                shard_header.sub_prefix_count,
                                shard_header.add_hash_count,
                                shard_header.sub_hash_count,
                                file.get(), &context)) {
        return false;
      }

      in_min += in_stride;
    } while (in_min <= kMaxSBPrefix);
  }

  if (!ReadAndVerifyChecksum(file.get(), &context))
    return false;

  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;

  return static_cast<int64>(ftell(file.get())) == size;
}

}  // namespace

// static
void SafeBrowsingStoreFile::CheckForOriginalAndDelete(
    const base::FilePath& current_filename) {
  const base::FilePath original_filename(
      current_filename.DirName().AppendASCII("Safe Browsing"));
  if (base::PathExists(original_filename)) {
    int64 size = 0;
    if (base::GetFileSize(original_filename, &size)) {
      UMA_HISTOGRAM_COUNTS("SB2.OldDatabaseKilobytes",
                           static_cast<int>(size / 1024));
    }

    if (base::DeleteFile(original_filename, false)) {
      RecordFormatEvent(FORMAT_EVENT_DELETED_ORIGINAL);
    } else {
      RecordFormatEvent(FORMAT_EVENT_DELETED_ORIGINAL_FAILED);
    }

    // Just best-effort on the journal file, don't want to get lost in
    // the weeds.
    const base::FilePath journal_filename(
        current_filename.DirName().AppendASCII("Safe Browsing-journal"));
    base::DeleteFile(journal_filename, false);
  }
}

SafeBrowsingStoreFile::SafeBrowsingStoreFile()
    : chunks_written_(0), empty_(false), corruption_seen_(false) {}

SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
  Close();
}

bool SafeBrowsingStoreFile::Delete() {
  // The database should not be open at this point. But, just in
  // case, close everything before deleting.
  if (!Close()) {
    NOTREACHED();
    return false;
  }

  return DeleteStore(filename_);
}

bool SafeBrowsingStoreFile::CheckValidity() {
  // The file was either empty or never opened. The empty case is
  // presumed not to be invalid. The never-opened case can happen if
  // BeginUpdate() fails for any databases, and should already have
  // caused the corruption callback to fire.
  if (!file_.get())
    return true;

  if (!FileRewind(file_.get()))
    return OnCorruptDatabase();

  int64 size = 0;
  if (!base::GetFileSize(filename_, &size))
    return OnCorruptDatabase();

  base::MD5Context context;
  base::MD5Init(&context);

  // Read everything except the final digest.
  size_t bytes_left = static_cast<size_t>(size);
  CHECK(size == static_cast<int64>(bytes_left));
  if (bytes_left < sizeof(base::MD5Digest))
    return OnCorruptDatabase();
  bytes_left -= sizeof(base::MD5Digest);

  // Fold the contents of the file into the checksum.
  while (bytes_left > 0) {
    char buf[4096];
    const size_t c = std::min(sizeof(buf), bytes_left);
    const size_t ret = fread(buf, 1, c, file_.get());

    // The file's size changed while reading, give up.
    if (ret != c)
      return OnCorruptDatabase();
    base::MD5Update(&context, base::StringPiece(buf, c));
    bytes_left -= c;
  }

  if (!ReadAndVerifyChecksum(file_.get(), &context)) {
    RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
    return OnCorruptDatabase();
  }

  return true;
}

void SafeBrowsingStoreFile::Init(
    const base::FilePath& filename,
    const base::Closure& corruption_callback) {
  filename_ = filename;
  corruption_callback_ = corruption_callback;
}

bool SafeBrowsingStoreFile::BeginChunk() {
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
  add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
  add_prefixes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_prefixes->swap(db_state.add_prefixes_);
  return true;
}

bool SafeBrowsingStoreFile::GetAddFullHashes(
    std::vector<SBAddFullHash>* add_full_hashes) {
  add_full_hashes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_full_hashes->swap(db_state.add_full_hashes_);
  return true;
}

bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
                                         const SBFullHash& full_hash) {
  add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
                                           int32 add_chunk_id,
                                           SBPrefix prefix) {
  sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
                                         const SBFullHash& full_hash) {
  sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::OnCorruptDatabase() {
  if (!corruption_seen_)
    RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
  corruption_seen_ = true;

  corruption_callback_.Run();

  // Return false as a convenience to callers.
  return false;
}

bool SafeBrowsingStoreFile::Close() {
  ClearUpdateBuffers();

  // Make sure the files are closed.
  file_.reset();
  new_file_.reset();
  return true;
}

bool SafeBrowsingStoreFile::BeginUpdate() {
  DCHECK(!file_.get() && !new_file_.get());

  // Structures should all be clear unless something bad happened.
  DCHECK(add_chunks_cache_.empty());
  DCHECK(sub_chunks_cache_.empty());
  DCHECK(add_del_cache_.empty());
  DCHECK(sub_del_cache_.empty());
  DCHECK(add_prefixes_.empty());
  DCHECK(sub_prefixes_.empty());
  DCHECK(add_hashes_.empty());
  DCHECK(sub_hashes_.empty());
  DCHECK_EQ(chunks_written_, 0);

  // Since the following code will already hit the profile looking for
  // database files, this is a reasonable time to delete any old files.
  CheckForOriginalAndDelete(filename_);

  corruption_seen_ = false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
  if (new_file.get() == NULL)
    return false;

  base::ScopedFILE file(base::OpenFile(filename_, "rb"));
  empty_ = (file.get() == NULL);
  if (empty_) {
    // If the file exists but cannot be opened, try to delete it (not
    // deleting directly, the bloom filter needs to be deleted, too).
    if (base::PathExists(filename_))
      return OnCorruptDatabase();

    new_file_.swap(new_file);
    return true;
  }

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename_, &header,
                          &add_chunks_cache_, &sub_chunks_cache_,
                          file.get(), &context);
  if (version == kInvalidVersion) {
    FileHeaderV8 retry_header;
    if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL) &&
        (retry_header.magic != kFileMagic ||
         (retry_header.version != 8 && retry_header.version != 7))) {
      // TODO(shess): Think on whether these histograms are generating any
      // actionable data. I kid you not, SQLITE happens many thousands of
      // times per day, UNKNOWN about 3x higher than that.
      if (!strcmp(reinterpret_cast<char*>(&retry_header.magic),
                  "SQLite format 3")) {
        RecordFormatEvent(FORMAT_EVENT_FOUND_SQLITE);
      } else {
        RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
      }
    }

    // Close the file so that it can be deleted.
    file.reset();

    return OnCorruptDatabase();
  }

  file_.swap(file);
  new_file_.swap(new_file);
  return true;
}

bool SafeBrowsingStoreFile::FinishChunk() {
  if (!add_prefixes_.size() && !sub_prefixes_.size() &&
      !add_hashes_.size() && !sub_hashes_.size())
    return true;

  ChunkHeader header;
  header.add_prefix_count = add_prefixes_.size();
  header.sub_prefix_count = sub_prefixes_.size();
  header.add_hash_count = add_hashes_.size();
  header.sub_hash_count = sub_hashes_.size();
  if (!WriteItem(header, new_file_.get(), NULL))
    return false;

  if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_hashes_, new_file_.get(), NULL))
    return false;

  ++chunks_written_;

  // Clear everything to save memory.
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::DoUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(file_.get() || empty_);
  DCHECK(new_file_.get());
  CHECK(builder);
  CHECK(add_full_hashes_result);

  // Rewind the temporary storage.
  if (!FileRewind(new_file_.get()))
    return false;

  // Get chunk file's size for validating counts.
  int64 update_size = 0;
  if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
    return OnCorruptDatabase();

  // Track update size to answer questions at http://crbug.com/72216 .
  // Log small updates as 1k so that the 0 (underflow) bucket can be
  // used for "empty" in SafeBrowsingDatabase.
  UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
                       std::max(static_cast<int>(update_size / 1024), 1));

  // Chunk updates to integrate.
  StateInternal new_state;

  // Read update chunks.
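  // The temporary file holds one record per FinishChunk() call: a ChunkHeader
  // followed by that chunk's add/sub prefixes and full hashes, for a total of
  // |chunks_written_| records.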
  for (int i = 0; i < chunks_written_; ++i) {
    ChunkHeader header;

    int64 ofs = ftell(new_file_.get());
    if (ofs == -1)
      return false;

    if (!ReadItem(&header, new_file_.get(), NULL))
      return false;

    // As a safety measure, make sure that the header describes a sane
    // chunk, given the remaining file size.
    int64 expected_size = ofs + sizeof(ChunkHeader);
    expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
    expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
    expected_size += header.add_hash_count * sizeof(SBAddFullHash);
    expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
    if (expected_size > update_size)
      return false;

    if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
                              header.add_hash_count, header.sub_hash_count,
                              new_file_.get(), NULL)) {
      return false;
    }
  }

  // The state was accumulated by chunk, sort by prefix.
  new_state.SortData();

  // These strides control how much data is loaded into memory per pass.
  // Strides must be a power of two. |in_stride| will be derived from the
  // input file. |out_stride| will be derived from an estimate of the
  // resulting file's size. |process_stride| will be the max of both.
  uint64 in_stride = kMaxShardStride;
  uint64 out_stride = kMaxShardStride;
  uint64 process_stride = 0;

  // The header info is only used later if |!empty_|. The v8 read loop only
  // needs |in_stride|, while v7 needs to refer to header information.
  base::MD5Context in_context;
  int version = kInvalidVersion;
  FileHeader header;

  if (!empty_) {
    DCHECK(file_.get());

    version = ReadAndVerifyHeader(filename_, &header,
                                  &add_chunks_cache_, &sub_chunks_cache_,
                                  file_.get(), &in_context);
    if (version == kInvalidVersion)
      return OnCorruptDatabase();

    if (version == 8 && header.v8.shard_stride)
      in_stride = header.v8.shard_stride;

    // The header checksum should have prevented this case, but the code will
    // be broken if this is not correct.
    if (!IsPowerOfTwo(in_stride))
      return OnCorruptDatabase();
  }

  // We no longer need to track deleted chunks.
  DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
  DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);

  // Calculate |out_stride| to break the file down into reasonable shards.
  {
    int64 original_size = 0;
    if (!empty_ && !base::GetFileSize(filename_, &original_size))
      return OnCorruptDatabase();

    // Approximate the final size as everything. Subs and deletes will reduce
    // the size, but modest over-sharding won't hurt much.
    int64 shard_size = original_size + update_size;

    // Keep splitting until a single stride of data fits the target.
    size_t shifts = 0;
    while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
      out_stride >>= 1;
      shard_size >>= 1;
      ++shifts;
    }
    UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);

    DCHECK(IsPowerOfTwo(out_stride));
  }
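
  // Illustrative numbers only: with roughly 6 MB of combined original and
  // update data, the loop above halves |out_stride| six times, from 2^32 down
  // to 2^26, leaving 64 output shards of roughly kUpdateStorageBytes each.
  // kMinShardStride caps the number of halvings, so |out_stride| never drops
  // below 1 << 24.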

  // Outer loop strides by the max of the input stride (to read integral
  // shards) and the output stride (to write integral shards).
  process_stride = std::max(in_stride, out_stride);
  DCHECK(IsPowerOfTwo(process_stride));
  DCHECK_EQ(0u, process_stride % in_stride);
  DCHECK_EQ(0u, process_stride % out_stride);

  // Start writing the new data to |new_file_|.
  base::MD5Context out_context;
  if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
                   new_file_.get(), &out_context)) {
    return false;
  }

  // Start at the beginning of the SBPrefix space.
  uint64 in_min = 0;
  uint64 out_min = 0;
  uint64 process_min = 0;

  // Start at the beginning of the updates.
  StateInternalPos new_pos = new_state.StateBegin();

  // Re-usable container for shard processing.
  StateInternal db_state;

  // Track aggregate counts for histograms.
  size_t add_prefix_count = 0;
  size_t sub_prefix_count = 0;

  do {
    // Maximum element in the current shard.
    SBPrefix process_max =
        static_cast<SBPrefix>(process_min + process_stride - 1);
    DCHECK_GT(process_max, process_min);

    // Drop the data from previous pass.
    db_state.ClearData();

    // Fill the processing shard with one or more input shards.
    if (!empty_) {
      if (version == 7) {
        // Treat v7 as a single-shard file.
        DCHECK_EQ(in_min, 0u);
        DCHECK_EQ(in_stride, kMaxShardStride);
        DCHECK_EQ(process_stride, kMaxShardStride);
        if (!db_state.AppendData(header.v7.add_prefix_count,
                                 header.v7.sub_prefix_count,
                                 header.v7.add_hash_count,
                                 header.v7.sub_hash_count,
                                 file_.get(), &in_context))
          return OnCorruptDatabase();

        // v7 data is not sorted correctly.
        db_state.SortData();
      } else {
        do {
          ShardHeader shard_header;
          if (!ReadItem(&shard_header, file_.get(), &in_context))
            return OnCorruptDatabase();

          if (!db_state.AppendData(shard_header.add_prefix_count,
                                   shard_header.sub_prefix_count,
                                   shard_header.add_hash_count,
                                   shard_header.sub_hash_count,
                                   file_.get(), &in_context))
            return OnCorruptDatabase();

          in_min += in_stride;
        } while (in_min <= kMaxSBPrefix && in_min < process_max);
      }
    }

    // Shard the update data to match the database data, then merge the update
    // data and process the results.
    {
      StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
      db_state.MergeDataAndProcess(new_pos, new_end,
                                   add_del_cache_, sub_del_cache_);
      new_pos = new_end;
    }

    // Collect the processed data for return to caller.
    for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
      builder->AddPrefix(db_state.add_prefixes_[i].prefix);
    }
    add_full_hashes_result->insert(add_full_hashes_result->end(),
                                   db_state.add_full_hashes_.begin(),
                                   db_state.add_full_hashes_.end());
    add_prefix_count += db_state.add_prefixes_.size();
    sub_prefix_count += db_state.sub_prefixes_.size();

    // Write one or more shards of processed output.
    StateInternalPos out_pos = db_state.StateBegin();
    do {
      SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
      DCHECK_GT(out_max, out_min);

      StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
      if (!db_state.WriteShard(out_pos, out_end, new_file_.get(),
                               &out_context))
        return false;
      out_pos = out_end;

      out_min += out_stride;
    } while (out_min == static_cast<SBPrefix>(out_min) &&
             out_min < process_max);

    process_min += process_stride;
  } while (process_min <= kMaxSBPrefix);

  // Verify the overall checksum.
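  // |in_context| has been folded over every byte read from the input file, so
  // it should now match the digest stored at that file's tail.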
  if (!empty_) {
    if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
      RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
      return OnCorruptDatabase();
    }

    // TODO(shess): Verify EOF?

    // Close the input file so the new file can be renamed over it.
    file_.reset();
  }
  DCHECK(!file_.get());

  // Write the overall checksum.
  base::MD5Digest out_digest;
  base::MD5Final(&out_digest, &out_context);
  if (!WriteItem(out_digest, new_file_.get(), NULL))
    return false;

  // Trim any excess left over from the temporary chunk data.
  if (!base::TruncateFile(new_file_.get()))
    return false;

  // Close the file handle and swizzle the file into place.
  new_file_.reset();
  if (!base::DeleteFile(filename_, false) &&
      base::PathExists(filename_))
    return false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  if (!base::Move(new_filename, filename_))
    return false;

  // Record counts before swapping to caller.
  UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
  UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);

  return true;
}

bool SafeBrowsingStoreFile::FinishUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(builder);
  DCHECK(add_full_hashes_result);

  if (!DoUpdate(builder, add_full_hashes_result)) {
    CancelUpdate();
    return false;
  }

  DCHECK(!new_file_.get());
  DCHECK(!file_.get());

  return Close();
}

bool SafeBrowsingStoreFile::CancelUpdate() {
  return Close();
}

void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
  add_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
  return add_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
}

void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
  sub_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
  return sub_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
}

void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
  add_del_cache_.insert(chunk_id);
}

void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
  sub_del_cache_.insert(chunk_id);
}

// static
bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
  if (!base::DeleteFile(basename, false) &&
      base::PathExists(basename)) {
    NOTREACHED();
    return false;
  }

  const base::FilePath new_filename = TemporaryFileForFilename(basename);
  if (!base::DeleteFile(new_filename, false) &&
      base::PathExists(new_filename)) {
    NOTREACHED();
    return false;
  }

  // With SQLite support gone, one way to get to this code is if the
  // existing file is a SQLite file. Make sure the journal file is
  // also removed.
  const base::FilePath journal_filename(
      basename.value() + FILE_PATH_LITERAL("-journal"));
  if (base::PathExists(journal_filename))
    base::DeleteFile(journal_filename, false);

  return true;
}