Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/safe_browsing/safe_browsing_store_file.h"
      6 
      7 #include "base/files/file_util.h"
      8 #include "base/files/scoped_file.h"
      9 #include "base/md5.h"
     10 #include "base/metrics/histogram.h"
     11 #include "base/metrics/sparse_histogram.h"
     12 
     13 namespace {
     14 
     15 // NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
     16 // that byte-order changes force corruption.
     17 const int32 kFileMagic = 0x600D71FE;
     18 
     19 // Version history:
     20 // Version 6: aad08754/r2814 by erikkay (at) google.com on 2008-10-02 (sqlite)
     21 // Version 7: 6afe28a5/r37435 by shess (at) chromium.org on 2010-01-28
     22 // Version 8: d3dd0715/r259791 by shess (at) chromium.org on 2014-03-27
     23 const int32 kFileVersion = 8;
     24 
     25 // ReadAndVerifyHeader() returns this in case of error.
     26 const int32 kInvalidVersion = -1;
     27 
     28 // Starting with version 8, the storage is sorted and can be sharded to allow
     29 // updates to be done with lower memory requirements.  Newly written files will
     30 // be sharded to need less than this amount of memory during update.  Larger
     31 // values are preferred to minimize looping overhead during processing.
     32 const int64 kUpdateStorageBytes = 100 * 1024;
     33 
     34 // Prevent excessive sharding by setting a lower limit on the shard stride.
     35 // Smaller values should work fine, but very small values will probably lead to
     36 // poor performance.  Shard stride is indirectly related to
     37 // |kUpdateStorageBytes|, setting that very small will bump against this.
     38 const uint32 kMinShardStride = 1 << 24;
     39 
     40 // Strides over the entire SBPrefix space.
     41 const uint64 kMaxShardStride = 1ULL << 32;
     42 
     43 // Maximum SBPrefix value.
     44 const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;
     45 
     46 // Header at the front of the main database file.
     47 struct FileHeader {
     48   int32 magic, version;
     49   uint32 add_chunk_count, sub_chunk_count;
     50   uint32 shard_stride;
     51   // TODO(shess): Is this where 64-bit will bite me?  Perhaps write a
     52   // specialized read/write?
     53 };
     54 
     55 // Header for each chunk in the chunk-accumulation file.
     56 struct ChunkHeader {
     57   uint32 add_prefix_count, sub_prefix_count;
     58   uint32 add_hash_count, sub_hash_count;
     59 };
     60 
     61 // Header for each shard of data in the main database file.
     62 struct ShardHeader {
     63   uint32 add_prefix_count, sub_prefix_count;
     64   uint32 add_hash_count, sub_hash_count;
     65 };
     66 
     67 // Enumerate different format-change events for histogramming
     68 // purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
     69 enum FormatEventType {
     70   // Corruption detected, broken down by file format.
     71   FORMAT_EVENT_FILE_CORRUPT,
     72   FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete
     73 
     74   // The type of format found in the file.  The expected case (new
     75   // file format) is intentionally not covered.
     76   FORMAT_EVENT_FOUND_SQLITE,  // Obsolete
     77   FORMAT_EVENT_FOUND_UNKNOWN,  // magic does not match.
     78 
     79   // The number of SQLite-format files deleted should be the same as
     80   // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
     81   // or if a failure prevents the update from succeeding.
     82   FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
     83   FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete
     84 
     85   // Found and deleted (or failed to delete) the ancient "Safe
     86   // Browsing" file.
     87   FORMAT_EVENT_DELETED_ORIGINAL,  // Obsolete
     88   FORMAT_EVENT_DELETED_ORIGINAL_FAILED,  // Obsolete
     89 
     90   // The checksum did not check out in CheckValidity() or in
     91   // FinishUpdate().  This most likely indicates that the machine
     92   // crashed before the file was fully sync'ed to disk.
     93   FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
     94   FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,
     95 
     96   // The header checksum was incorrect in ReadAndVerifyHeader().  Likely
     97   // indicates that the system crashed while writing an update.
     98   FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,
     99 
    100   FORMAT_EVENT_FOUND_DEPRECATED,  // version too old.
    101 
    102   // Memory space for histograms is determined by the max.  ALWAYS
    103   // ADD NEW VALUES BEFORE THIS ONE.
    104   FORMAT_EVENT_MAX
    105 };
    106 
    107 void RecordFormatEvent(FormatEventType event_type) {
    108   UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
    109 }
    110 
    111 // Rewind the file.  Using fseek(2) because rewind(3) errors are
    112 // weird.
    113 bool FileRewind(FILE* fp) {
    114   int rv = fseek(fp, 0, SEEK_SET);
    115   DCHECK_EQ(rv, 0);
    116   return rv == 0;
    117 }
    118 
    119 // Read from |fp| into |item|, and fold the input data into the
    120 // checksum in |context|, if non-NULL.  Return true on success.
    121 template <class T>
    122 bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
    123   const size_t ret = fread(item, sizeof(T), 1, fp);
    124   if (ret != 1)
    125     return false;
    126 
    127   if (context) {
    128     base::MD5Update(context,
    129                     base::StringPiece(reinterpret_cast<char*>(item),
    130                                       sizeof(T)));
    131   }
    132   return true;
    133 }
    134 
    135 // Write |item| to |fp|, and fold the output data into the checksum in
    136 // |context|, if non-NULL.  Return true on success.
    137 template <class T>
    138 bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
    139   const size_t ret = fwrite(&item, sizeof(T), 1, fp);
    140   if (ret != 1)
    141     return false;
    142 
    143   if (context) {
    144     base::MD5Update(context,
    145                     base::StringPiece(reinterpret_cast<const char*>(&item),
    146                                       sizeof(T)));
    147   }
    148 
    149   return true;
    150 }
    151 
    152 // Read |count| items into |values| from |fp|, and fold them into the
    153 // checksum in |context|.  Returns true on success.
    154 template <typename CT>
    155 bool ReadToContainer(CT* values, size_t count, FILE* fp,
    156                      base::MD5Context* context) {
    157   if (!count)
    158     return true;
    159 
    160   for (size_t i = 0; i < count; ++i) {
    161     typename CT::value_type value;
    162     if (!ReadItem(&value, fp, context))
    163       return false;
    164 
    165     // push_back() is more obvious, but coded this way std::set can
    166     // also be read.
    167     values->insert(values->end(), value);
    168   }
    169 
    170   return true;
    171 }
    172 
    173 // Write values between |beg| and |end| to |fp|, and fold the data into the
    174 // checksum in |context|, if non-NULL.  Returns true if all items successful.
    175 template <typename CTI>
    176 bool WriteRange(const CTI& beg, const CTI& end,
    177                 FILE* fp, base::MD5Context* context) {
    178   for (CTI iter = beg; iter != end; ++iter) {
    179     if (!WriteItem(*iter, fp, context))
    180       return false;
    181   }
    182   return true;
    183 }
    184 
    185 // Write all of |values| to |fp|, and fold the data into the checksum
    186 // in |context|, if non-NULL.  Returns true if all items successful.
    187 template <typename CT>
    188 bool WriteContainer(const CT& values, FILE* fp,
    189                     base::MD5Context* context) {
    190   return WriteRange(values.begin(), values.end(), fp, context);
    191 }
    192 
    193 // Delete the chunks in |deleted| from |chunks|.
    194 void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
    195                          std::set<int32>* chunks) {
    196   for (std::set<int32>::iterator iter = chunks->begin();
    197        iter != chunks->end();) {
    198     std::set<int32>::iterator prev = iter++;
    199     if (deleted.count(*prev) > 0)
    200       chunks->erase(prev);
    201   }
    202 }
    203 
    204 bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
    205   base::MD5Digest calculated_digest;
    206   base::MD5IntermediateFinal(&calculated_digest, context);
    207 
    208   base::MD5Digest file_digest;
    209   if (!ReadItem(&file_digest, fp, context))
    210     return false;
    211 
    212   return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
    213 }
    214 
    215 // Helper function to read the file header and chunk TOC.  Rewinds |fp| and
    216 // initializes |context|.  The header is left in |header|, with the version
    217 // returned.  kInvalidVersion is returned for sanity check or checksum failure.
    218 int ReadAndVerifyHeader(const base::FilePath& filename,
    219                         FileHeader* header,
    220                         std::set<int32>* add_chunks,
    221                         std::set<int32>* sub_chunks,
    222                         FILE* fp,
    223                         base::MD5Context* context) {
    224   DCHECK(header);
    225   DCHECK(add_chunks);
    226   DCHECK(sub_chunks);
    227   DCHECK(fp);
    228   DCHECK(context);
    229 
    230   base::MD5Init(context);
    231   if (!FileRewind(fp))
    232     return kInvalidVersion;
    233   if (!ReadItem(header, fp, context))
    234     return kInvalidVersion;
    235   if (header->magic != kFileMagic)
    236     return kInvalidVersion;
    237 
    238   // Track version read to inform removal of support for older versions.
    239   UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);
    240 
    241   if (header->version != kFileVersion)
    242     return kInvalidVersion;
    243 
    244   if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
    245       !ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
    246     return kInvalidVersion;
    247   }
    248 
    249   // Verify that the data read thus far is valid.
    250   if (!ReadAndVerifyChecksum(fp, context)) {
    251     RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
    252     return kInvalidVersion;
    253   }
    254 
    255   return kFileVersion;
    256 }
    257 
    258 // Helper function to write out the initial header and chunks-contained data.
    259 // Rewinds |fp|, initializes |context|, then writes a file header and
    260 // |add_chunks| and |sub_chunks|.
    261 bool WriteHeader(uint32 out_stride,
    262                  const std::set<int32>& add_chunks,
    263                  const std::set<int32>& sub_chunks,
    264                  FILE* fp,
    265                  base::MD5Context* context) {
    266   if (!FileRewind(fp))
    267     return false;
    268 
    269   base::MD5Init(context);
    270   FileHeader header;
    271   header.magic = kFileMagic;
    272   header.version = kFileVersion;
    273   header.add_chunk_count = add_chunks.size();
    274   header.sub_chunk_count = sub_chunks.size();
    275   header.shard_stride = out_stride;
    276   if (!WriteItem(header, fp, context))
    277     return false;
    278 
    279   if (!WriteContainer(add_chunks, fp, context) ||
    280       !WriteContainer(sub_chunks, fp, context))
    281     return false;
    282 
    283   // Write out the header digest.
    284   base::MD5Digest header_digest;
    285   base::MD5IntermediateFinal(&header_digest, context);
    286   if (!WriteItem(header_digest, fp, context))
    287     return false;
    288 
    289   return true;
    290 }
    291 
    292 // Return |true| if the range is sorted by the given comparator.
    293 template <typename CTI, typename LESS>
    294 bool sorted(CTI beg, CTI end, LESS less) {
    295   while ((end - beg) > 2) {
    296     CTI n = beg++;
    297     DCHECK(!less(*beg, *n));
    298     if (less(*beg, *n))
    299       return false;
    300   }
    301   return true;
    302 }
    303 
    304 // Merge |beg|..|end| into |container|.  Both should be sorted by the given
    305 // comparator, and the range iterators should not be derived from |container|.
    306 // Differs from std::inplace_merge() in that additional memory is not required
    307 // for linear performance.
    308 template <typename CT, typename CTI, typename COMP>
    309 void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
    310   DCHECK(sorted(container->begin(), container->end(), less));
    311   DCHECK(sorted(beg, end, less));
    312 
    313   // Size the container to fit the results.
    314   const size_t c_size = container->size();
    315   container->resize(c_size + (end - beg));
    316 
    317   // |c_end| points to the original endpoint, while |c_out| points to the
    318   // endpoint that will scan from end to beginning while merging.
    319   typename CT::iterator c_end = container->begin() + c_size;
    320   typename CT::iterator c_out = container->end();
    321 
    322   // While both inputs have data, move the greater to |c_out|.
    323   while (c_end != container->begin() && end != beg) {
    324     if (less(*(c_end - 1), *(end - 1))) {
    325       *(--c_out) = *(--end);
    326     } else {
    327       *(--c_out) = *(--c_end);
    328     }
    329   }
    330 
    331   // Copy any data remaining in the new range.
    332   if (end != beg) {
    333     // The original container data has been fully shifted.
    334     DCHECK(c_end == container->begin());
    335 
    336     // There is exactly the correct amount of space left.
    337     DCHECK_EQ(c_out - c_end, end - beg);
    338 
    339     std::copy(beg, end, container->begin());
    340   }
    341 
    342   DCHECK(sorted(container->begin(), container->end(), less));
    343 }
    344 
    345 // Collection of iterators used while stepping through StateInternal (see
    346 // below).
    347 class StateInternalPos {
    348  public:
    349   StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
    350                    SBSubPrefixes::iterator sub_prefixes_iter,
    351                    std::vector<SBAddFullHash>::iterator add_hashes_iter,
    352                    std::vector<SBSubFullHash>::iterator sub_hashes_iter)
    353       : add_prefixes_iter_(add_prefixes_iter),
    354         sub_prefixes_iter_(sub_prefixes_iter),
    355         add_hashes_iter_(add_hashes_iter),
    356         sub_hashes_iter_(sub_hashes_iter) {
    357   }
    358 
    359   SBAddPrefixes::iterator add_prefixes_iter_;
    360   SBSubPrefixes::iterator sub_prefixes_iter_;
    361   std::vector<SBAddFullHash>::iterator add_hashes_iter_;
    362   std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
    363 };
    364 
    365 // Helper to find the next shard boundary.
    366 template <class T>
    367 bool prefix_bounder(SBPrefix val, const T& elt) {
    368   return val < elt.GetAddPrefix();
    369 }
    370 
    371 // Container for partial database state.  Includes add/sub prefixes/hashes, plus
    372 // aggregate operations on same.
    373 class StateInternal {
    374  public:
    375   // Append indicated amount of data from |fp|.
    376   bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
    377                   size_t add_hash_count, size_t sub_hash_count,
    378                   FILE* fp, base::MD5Context* context) {
    379     return
    380         ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
    381         ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
    382         ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
    383         ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
    384   }
    385 
    386   void ClearData() {
    387     add_prefixes_.clear();
    388     sub_prefixes_.clear();
    389     add_full_hashes_.clear();
    390     sub_full_hashes_.clear();
    391   }
    392 
    393   // Merge data from |beg|..|end| into receiver's state, then process the state.
    394   // The current state and the range given should corrospond to the same sorted
    395   // shard of data from different sources.  |add_del_cache| and |sub_del_cache|
    396   // indicate the chunk ids which should be deleted during processing (see
    397   // SBProcessSubs).
    398   void MergeDataAndProcess(const StateInternalPos& beg,
    399                            const StateInternalPos& end,
    400                            const base::hash_set<int32>& add_del_cache,
    401                            const base::hash_set<int32>& sub_del_cache) {
    402     container_merge(&add_prefixes_,
    403                     beg.add_prefixes_iter_,
    404                     end.add_prefixes_iter_,
    405                     SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
    406 
    407     container_merge(&sub_prefixes_,
    408                     beg.sub_prefixes_iter_,
    409                     end.sub_prefixes_iter_,
    410                     SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
    411 
    412     container_merge(&add_full_hashes_,
    413                     beg.add_hashes_iter_,
    414                     end.add_hashes_iter_,
    415                     SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
    416 
    417     container_merge(&sub_full_hashes_,
    418                     beg.sub_hashes_iter_,
    419                     end.sub_hashes_iter_,
    420                     SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
    421 
    422     SBProcessSubs(&add_prefixes_, &sub_prefixes_,
    423                   &add_full_hashes_, &sub_full_hashes_,
    424                   add_del_cache, sub_del_cache);
    425   }
    426 
    427   // Sort the data appropriately for the sharding, merging, and processing
    428   // operations.
    429   void SortData() {
    430     std::sort(add_prefixes_.begin(), add_prefixes_.end(),
    431               SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
    432     std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
    433               SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
    434     std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
    435               SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
    436     std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
    437               SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
    438   }
    439 
    440   // Iterator from the beginning of the state's data.
    441   StateInternalPos StateBegin() {
    442     return StateInternalPos(add_prefixes_.begin(),
    443                             sub_prefixes_.begin(),
    444                             add_full_hashes_.begin(),
    445                             sub_full_hashes_.begin());
    446   }
    447 
    448   // An iterator pointing just after the last possible element of the shard
    449   // indicated by |shard_max|.  Used to step through the state by shard.
    450   // TODO(shess): Verify whether binary search really improves over linear.
    451   // Merging or writing will immediately touch all of these elements.
    452   StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
    453     return StateInternalPos(
    454         std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
    455                          shard_max, prefix_bounder<SBAddPrefix>),
    456         std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
    457                          shard_max, prefix_bounder<SBSubPrefix>),
    458         std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
    459                          shard_max, prefix_bounder<SBAddFullHash>),
    460         std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
    461                          shard_max, prefix_bounder<SBSubFullHash>));
    462   }
    463 
    464   // Write a shard header and data for the shard starting at |beg| and ending at
    465   // the element before |end|.
    466   bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
    467                   FILE* fp, base::MD5Context* context) {
    468     ShardHeader shard_header;
    469     shard_header.add_prefix_count =
    470         end.add_prefixes_iter_ - beg.add_prefixes_iter_;
    471     shard_header.sub_prefix_count =
    472         end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
    473     shard_header.add_hash_count =
    474         end.add_hashes_iter_ - beg.add_hashes_iter_;
    475     shard_header.sub_hash_count =
    476         end.sub_hashes_iter_ - beg.sub_hashes_iter_;
    477 
    478     return
    479         WriteItem(shard_header, fp, context) &&
    480         WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
    481                    fp, context) &&
    482         WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
    483                    fp, context) &&
    484         WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
    485                    fp, context) &&
    486         WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
    487                    fp, context);
    488   }
    489 
    490   SBAddPrefixes add_prefixes_;
    491   SBSubPrefixes sub_prefixes_;
    492   std::vector<SBAddFullHash> add_full_hashes_;
    493   std::vector<SBSubFullHash> sub_full_hashes_;
    494 };
    495 
    496 // True if |val| is an even power of two.
    497 template <typename T>
    498 bool IsPowerOfTwo(const T& val) {
    499   return val && (val & (val - 1)) == 0;
    500 }
    501 
    502 // Helper to read the entire database state, used by GetAddPrefixes() and
    503 // GetAddFullHashes().  Those functions are generally used only for smaller
    504 // files.  Returns false in case of errors reading the data.
    505 bool ReadDbStateHelper(const base::FilePath& filename,
    506                        StateInternal* db_state) {
    507   base::ScopedFILE file(base::OpenFile(filename, "rb"));
    508   if (file.get() == NULL)
    509     return false;
    510 
    511   std::set<int32> add_chunks;
    512   std::set<int32> sub_chunks;
    513 
    514   base::MD5Context context;
    515   FileHeader header;
    516   const int version =
    517       ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
    518                           file.get(), &context);
    519   if (version == kInvalidVersion)
    520     return false;
    521 
    522   uint64 in_min = 0;
    523   uint64 in_stride = header.shard_stride;
    524   if (!in_stride)
    525     in_stride = kMaxShardStride;
    526   if (!IsPowerOfTwo(in_stride))
    527     return false;
    528 
    529   do {
    530     ShardHeader shard_header;
    531     if (!ReadItem(&shard_header, file.get(), &context))
    532       return false;
    533 
    534     if (!db_state->AppendData(shard_header.add_prefix_count,
    535                               shard_header.sub_prefix_count,
    536                               shard_header.add_hash_count,
    537                               shard_header.sub_hash_count,
    538                               file.get(), &context)) {
    539       return false;
    540     }
    541 
    542     in_min += in_stride;
    543   } while (in_min <= kMaxSBPrefix);
    544 
    545   if (!ReadAndVerifyChecksum(file.get(), &context))
    546     return false;
    547 
    548   int64 size = 0;
    549   if (!base::GetFileSize(filename, &size))
    550     return false;
    551 
    552   return static_cast<int64>(ftell(file.get())) == size;
    553 }
    554 
    555 }  // namespace
    556 
    557 SafeBrowsingStoreFile::SafeBrowsingStoreFile()
    558     : chunks_written_(0), empty_(false), corruption_seen_(false) {}
    559 
    560 SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
    561   Close();
    562 }
    563 
    564 bool SafeBrowsingStoreFile::Delete() {
    565   // The database should not be open at this point.  But, just in
    566   // case, close everything before deleting.
    567   if (!Close()) {
    568     NOTREACHED();
    569     return false;
    570   }
    571 
    572   return DeleteStore(filename_);
    573 }
    574 
    575 bool SafeBrowsingStoreFile::CheckValidity() {
    576   // The file was either empty or never opened.  The empty case is
    577   // presumed not to be invalid.  The never-opened case can happen if
    578   // BeginUpdate() fails for any databases, and should already have
    579   // caused the corruption callback to fire.
    580   if (!file_.get())
    581     return true;
    582 
    583   if (!FileRewind(file_.get()))
    584     return OnCorruptDatabase();
    585 
    586   int64 size = 0;
    587   if (!base::GetFileSize(filename_, &size))
    588     return OnCorruptDatabase();
    589 
    590   base::MD5Context context;
    591   base::MD5Init(&context);
    592 
    593   // Read everything except the final digest.
    594   size_t bytes_left = static_cast<size_t>(size);
    595   CHECK(size == static_cast<int64>(bytes_left));
    596   if (bytes_left < sizeof(base::MD5Digest))
    597     return OnCorruptDatabase();
    598   bytes_left -= sizeof(base::MD5Digest);
    599 
    600   // Fold the contents of the file into the checksum.
    601   while (bytes_left > 0) {
    602     char buf[4096];
    603     const size_t c = std::min(sizeof(buf), bytes_left);
    604     const size_t ret = fread(buf, 1, c, file_.get());
    605 
    606     // The file's size changed while reading, give up.
    607     if (ret != c)
    608       return OnCorruptDatabase();
    609     base::MD5Update(&context, base::StringPiece(buf, c));
    610     bytes_left -= c;
    611   }
    612 
    613   if (!ReadAndVerifyChecksum(file_.get(), &context)) {
    614     RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
    615     return OnCorruptDatabase();
    616   }
    617 
    618   return true;
    619 }
    620 
    621 void SafeBrowsingStoreFile::Init(
    622     const base::FilePath& filename,
    623     const base::Closure& corruption_callback
    624 ) {
    625   filename_ = filename;
    626   corruption_callback_ = corruption_callback;
    627 }
    628 
    629 bool SafeBrowsingStoreFile::BeginChunk() {
    630   return ClearChunkBuffers();
    631 }
    632 
    633 bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
    634   add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
    635   return true;
    636 }
    637 
    638 bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
    639   add_prefixes->clear();
    640   if (!base::PathExists(filename_))
    641     return true;
    642 
    643   StateInternal db_state;
    644   if (!ReadDbStateHelper(filename_, &db_state))
    645     return OnCorruptDatabase();
    646 
    647   add_prefixes->swap(db_state.add_prefixes_);
    648   return true;
    649 }
    650 
    651 bool SafeBrowsingStoreFile::GetAddFullHashes(
    652     std::vector<SBAddFullHash>* add_full_hashes) {
    653   add_full_hashes->clear();
    654   if (!base::PathExists(filename_))
    655     return true;
    656 
    657   StateInternal db_state;
    658   if (!ReadDbStateHelper(filename_, &db_state))
    659     return OnCorruptDatabase();
    660 
    661   add_full_hashes->swap(db_state.add_full_hashes_);
    662   return true;
    663 }
    664 
    665 bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
    666                                          const SBFullHash& full_hash) {
    667   add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
    668   return true;
    669 }
    670 
    671 bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
    672                                            int32 add_chunk_id,
    673                                            SBPrefix prefix) {
    674   sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
    675   return true;
    676 }
    677 
    678 bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
    679                                          const SBFullHash& full_hash) {
    680   sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
    681   return true;
    682 }
    683 
    684 bool SafeBrowsingStoreFile::OnCorruptDatabase() {
    685   if (!corruption_seen_)
    686     RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
    687   corruption_seen_ = true;
    688 
    689   corruption_callback_.Run();
    690 
    691   // Return false as a convenience to callers.
    692   return false;
    693 }
    694 
    695 bool SafeBrowsingStoreFile::Close() {
    696   ClearUpdateBuffers();
    697 
    698   // Make sure the files are closed.
    699   file_.reset();
    700   new_file_.reset();
    701   return true;
    702 }
    703 
    704 bool SafeBrowsingStoreFile::BeginUpdate() {
    705   DCHECK(!file_.get() && !new_file_.get());
    706 
    707   // Structures should all be clear unless something bad happened.
    708   DCHECK(add_chunks_cache_.empty());
    709   DCHECK(sub_chunks_cache_.empty());
    710   DCHECK(add_del_cache_.empty());
    711   DCHECK(sub_del_cache_.empty());
    712   DCHECK(add_prefixes_.empty());
    713   DCHECK(sub_prefixes_.empty());
    714   DCHECK(add_hashes_.empty());
    715   DCHECK(sub_hashes_.empty());
    716   DCHECK_EQ(chunks_written_, 0);
    717 
    718   corruption_seen_ = false;
    719 
    720   const base::FilePath new_filename = TemporaryFileForFilename(filename_);
    721   base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
    722   if (new_file.get() == NULL)
    723     return false;
    724 
    725   base::ScopedFILE file(base::OpenFile(filename_, "rb"));
    726   empty_ = (file.get() == NULL);
    727   if (empty_) {
    728     // If the file exists but cannot be opened, try to delete it (not
    729     // deleting directly, the bloom filter needs to be deleted, too).
    730     if (base::PathExists(filename_))
    731       return OnCorruptDatabase();
    732 
    733     new_file_.swap(new_file);
    734     return true;
    735   }
    736 
    737   base::MD5Context context;
    738   FileHeader header;
    739   const int version =
    740       ReadAndVerifyHeader(filename_, &header,
    741                           &add_chunks_cache_, &sub_chunks_cache_,
    742                           file.get(), &context);
    743   if (version == kInvalidVersion) {
    744     FileHeader retry_header;
    745     if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
    746       if (retry_header.magic == kFileMagic &&
    747           retry_header.version < kFileVersion) {
    748         RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
    749       } else {
    750         RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
    751       }
    752     }
    753 
    754     // Close the file so that it can be deleted.
    755     file.reset();
    756 
    757     return OnCorruptDatabase();
    758   }
    759 
    760   file_.swap(file);
    761   new_file_.swap(new_file);
    762   return true;
    763 }
    764 
    765 bool SafeBrowsingStoreFile::FinishChunk() {
    766   if (!add_prefixes_.size() && !sub_prefixes_.size() &&
    767       !add_hashes_.size() && !sub_hashes_.size())
    768     return true;
    769 
    770   ChunkHeader header;
    771   header.add_prefix_count = add_prefixes_.size();
    772   header.sub_prefix_count = sub_prefixes_.size();
    773   header.add_hash_count = add_hashes_.size();
    774   header.sub_hash_count = sub_hashes_.size();
    775   if (!WriteItem(header, new_file_.get(), NULL))
    776     return false;
    777 
    778   if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
    779       !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
    780       !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
    781       !WriteContainer(sub_hashes_, new_file_.get(), NULL))
    782     return false;
    783 
    784   ++chunks_written_;
    785 
    786   // Clear everything to save memory.
    787   return ClearChunkBuffers();
    788 }
    789 
    790 bool SafeBrowsingStoreFile::DoUpdate(
    791     safe_browsing::PrefixSetBuilder* builder,
    792     std::vector<SBAddFullHash>* add_full_hashes_result) {
    793   DCHECK(file_.get() || empty_);
    794   DCHECK(new_file_.get());
    795   CHECK(builder);
    796   CHECK(add_full_hashes_result);
    797 
    798   // Rewind the temporary storage.
    799   if (!FileRewind(new_file_.get()))
    800     return false;
    801 
    802   // Get chunk file's size for validating counts.
    803   int64 update_size = 0;
    804   if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
    805     return OnCorruptDatabase();
    806 
    807   // Track update size to answer questions at http://crbug.com/72216 .
    808   // Log small updates as 1k so that the 0 (underflow) bucket can be
    809   // used for "empty" in SafeBrowsingDatabase.
    810   UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
    811                        std::max(static_cast<int>(update_size / 1024), 1));
    812 
    813   // Chunk updates to integrate.
    814   StateInternal new_state;
    815 
    816   // Read update chunks.
    817   for (int i = 0; i < chunks_written_; ++i) {
    818     ChunkHeader header;
    819 
    820     int64 ofs = ftell(new_file_.get());
    821     if (ofs == -1)
    822       return false;
    823 
    824     if (!ReadItem(&header, new_file_.get(), NULL))
    825       return false;
    826 
    827     // As a safety measure, make sure that the header describes a sane
    828     // chunk, given the remaining file size.
    829     int64 expected_size = ofs + sizeof(ChunkHeader);
    830     expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
    831     expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
    832     expected_size += header.add_hash_count * sizeof(SBAddFullHash);
    833     expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
    834     if (expected_size > update_size)
    835       return false;
    836 
    837     if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
    838                               header.add_hash_count, header.sub_hash_count,
    839                               new_file_.get(), NULL)) {
    840       return false;
    841     }
    842   }
    843 
    844   // The state was accumulated by chunk, sort by prefix.
    845   new_state.SortData();
    846 
    847   // These strides control how much data is loaded into memory per pass.
    848   // Strides must be an even power of two.  |in_stride| will be derived from the
    849   // input file.  |out_stride| will be derived from an estimate of the resulting
    850   // file's size.  |process_stride| will be the max of both.
    851   uint64 in_stride = kMaxShardStride;
    852   uint64 out_stride = kMaxShardStride;
    853   uint64 process_stride = 0;
    854 
    855   // Used to verify the input's checksum if |!empty_|.
    856   base::MD5Context in_context;
    857 
    858   if (!empty_) {
    859     DCHECK(file_.get());
    860 
    861     FileHeader header = {0};
    862     int version = ReadAndVerifyHeader(filename_, &header,
    863                                       &add_chunks_cache_, &sub_chunks_cache_,
    864                                       file_.get(), &in_context);
    865     if (version == kInvalidVersion)
    866       return OnCorruptDatabase();
    867 
    868     if (header.shard_stride)
    869       in_stride = header.shard_stride;
    870 
    871     // The header checksum should have prevented this case, but the code will be
    872     // broken if this is not correct.
    873     if (!IsPowerOfTwo(in_stride))
    874       return OnCorruptDatabase();
    875   }
    876 
    877   // We no longer need to track deleted chunks.
    878   DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
    879   DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);
    880 
    881   // Calculate |out_stride| to break the file down into reasonable shards.
    882   {
    883     int64 original_size = 0;
    884     if (!empty_ && !base::GetFileSize(filename_, &original_size))
    885       return OnCorruptDatabase();
    886 
    887     // Approximate the final size as everything.  Subs and deletes will reduce
    888     // the size, but modest over-sharding won't hurt much.
    889     int64 shard_size = original_size + update_size;
    890 
    891     // Keep splitting until a single stride of data fits the target.
    892     size_t shifts = 0;
    893     while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
    894       out_stride >>= 1;
    895       shard_size >>= 1;
    896       ++shifts;
    897     }
    898     UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);
    899 
    900     DCHECK(IsPowerOfTwo(out_stride));
    901   }
    902 
    903   // Outer loop strides by the max of the input stride (to read integral shards)
    904   // and the output stride (to write integral shards).
    905   process_stride = std::max(in_stride, out_stride);
    906   DCHECK(IsPowerOfTwo(process_stride));
    907   DCHECK_EQ(0u, process_stride % in_stride);
    908   DCHECK_EQ(0u, process_stride % out_stride);
    909 
    910   // Start writing the new data to |new_file_|.
    911   base::MD5Context out_context;
    912   if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
    913                    new_file_.get(), &out_context)) {
    914     return false;
    915   }
    916 
    917   // Start at the beginning of the SBPrefix space.
    918   uint64 in_min = 0;
    919   uint64 out_min = 0;
    920   uint64 process_min = 0;
    921 
    922   // Start at the beginning of the updates.
    923   StateInternalPos new_pos = new_state.StateBegin();
    924 
    925   // Re-usable container for shard processing.
    926   StateInternal db_state;
    927 
    928   // Track aggregate counts for histograms.
    929   size_t add_prefix_count = 0;
    930   size_t sub_prefix_count = 0;
    931 
    932   do {
    933     // Maximum element in the current shard.
    934     SBPrefix process_max =
    935         static_cast<SBPrefix>(process_min + process_stride - 1);
    936     DCHECK_GT(process_max, process_min);
    937 
    938     // Drop the data from previous pass.
    939     db_state.ClearData();
    940 
    941     // Fill the processing shard with one or more input shards.
    942     if (!empty_) {
    943       do {
    944         ShardHeader shard_header;
    945         if (!ReadItem(&shard_header, file_.get(), &in_context))
    946           return OnCorruptDatabase();
    947 
    948         if (!db_state.AppendData(shard_header.add_prefix_count,
    949                                  shard_header.sub_prefix_count,
    950                                  shard_header.add_hash_count,
    951                                  shard_header.sub_hash_count,
    952                                  file_.get(), &in_context))
    953           return OnCorruptDatabase();
    954 
    955         in_min += in_stride;
    956       } while (in_min <= kMaxSBPrefix && in_min < process_max);
    957     }
    958 
    959     // Shard the update data to match the database data, then merge the update
    960     // data and process the results.
    961     {
    962       StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
    963       db_state.MergeDataAndProcess(new_pos, new_end,
    964                                    add_del_cache_, sub_del_cache_);
    965       new_pos = new_end;
    966     }
    967 
    968     // Collect the processed data for return to caller.
    969     for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
    970       builder->AddPrefix(db_state.add_prefixes_[i].prefix);
    971     }
    972     add_full_hashes_result->insert(add_full_hashes_result->end(),
    973                                    db_state.add_full_hashes_.begin(),
    974                                    db_state.add_full_hashes_.end());
    975     add_prefix_count += db_state.add_prefixes_.size();
    976     sub_prefix_count += db_state.sub_prefixes_.size();
    977 
    978     // Write one or more shards of processed output.
    979     StateInternalPos out_pos = db_state.StateBegin();
    980     do {
    981       SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
    982       DCHECK_GT(out_max, out_min);
    983 
    984       StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
    985       if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context))
    986         return false;
    987       out_pos = out_end;
    988 
    989       out_min += out_stride;
    990     } while (out_min == static_cast<SBPrefix>(out_min) &&
    991              out_min < process_max);
    992 
    993     process_min += process_stride;
    994   } while (process_min <= kMaxSBPrefix);
    995 
    996   // Verify the overall checksum.
    997   if (!empty_) {
    998     if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
    999       RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
   1000       return OnCorruptDatabase();
   1001     }
   1002 
   1003     // TODO(shess): Verify EOF?
   1004 
   1005     // Close the input file so the new file can be renamed over it.
   1006     file_.reset();
   1007   }
   1008   DCHECK(!file_.get());
   1009 
   1010   // Write the overall checksum.
   1011   base::MD5Digest out_digest;
   1012   base::MD5Final(&out_digest, &out_context);
   1013   if (!WriteItem(out_digest, new_file_.get(), NULL))
   1014     return false;
   1015 
   1016   // Trim any excess left over from the temporary chunk data.
   1017   if (!base::TruncateFile(new_file_.get()))
   1018     return false;
   1019 
   1020   // Close the file handle and swizzle the file into place.
   1021   new_file_.reset();
   1022   if (!base::DeleteFile(filename_, false) &&
   1023       base::PathExists(filename_))
   1024     return false;
   1025 
   1026   const base::FilePath new_filename = TemporaryFileForFilename(filename_);
   1027   if (!base::Move(new_filename, filename_))
   1028     return false;
   1029 
   1030   // Record counts before swapping to caller.
   1031   UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
   1032   UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);
   1033 
   1034   return true;
   1035 }
   1036 
   1037 bool SafeBrowsingStoreFile::FinishUpdate(
   1038     safe_browsing::PrefixSetBuilder* builder,
   1039     std::vector<SBAddFullHash>* add_full_hashes_result) {
   1040   DCHECK(builder);
   1041   DCHECK(add_full_hashes_result);
   1042 
   1043   if (!DoUpdate(builder, add_full_hashes_result)) {
   1044     CancelUpdate();
   1045     return false;
   1046   }
   1047 
   1048   DCHECK(!new_file_.get());
   1049   DCHECK(!file_.get());
   1050 
   1051   return Close();
   1052 }
   1053 
   1054 bool SafeBrowsingStoreFile::CancelUpdate() {
   1055   bool ret = Close();
   1056 
   1057   // Delete stale staging file.
   1058   const base::FilePath new_filename = TemporaryFileForFilename(filename_);
   1059   base::DeleteFile(new_filename, false);
   1060 
   1061   return ret;
   1062 }
   1063 
   1064 void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
   1065   add_chunks_cache_.insert(chunk_id);
   1066 }
   1067 
   1068 bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
   1069   return add_chunks_cache_.count(chunk_id) > 0;
   1070 }
   1071 
   1072 void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
   1073   out->clear();
   1074   out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
   1075 }
   1076 
   1077 void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
   1078   sub_chunks_cache_.insert(chunk_id);
   1079 }
   1080 
   1081 bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
   1082   return sub_chunks_cache_.count(chunk_id) > 0;
   1083 }
   1084 
   1085 void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
   1086   out->clear();
   1087   out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
   1088 }
   1089 
   1090 void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
   1091   add_del_cache_.insert(chunk_id);
   1092 }
   1093 
   1094 void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
   1095   sub_del_cache_.insert(chunk_id);
   1096 }
   1097 
   1098 // static
   1099 bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
   1100   if (!base::DeleteFile(basename, false) &&
   1101       base::PathExists(basename)) {
   1102     NOTREACHED();
   1103     return false;
   1104   }
   1105 
   1106   const base::FilePath new_filename = TemporaryFileForFilename(basename);
   1107   if (!base::DeleteFile(new_filename, false) &&
   1108       base::PathExists(new_filename)) {
   1109     NOTREACHED();
   1110     return false;
   1111   }
   1112 
   1113   // With SQLite support gone, one way to get to this code is if the
   1114   // existing file is a SQLite file.  Make sure the journal file is
   1115   // also removed.
   1116   const base::FilePath journal_filename(
   1117       basename.value() + FILE_PATH_LITERAL("-journal"));
   1118   if (base::PathExists(journal_filename))
   1119     base::DeleteFile(journal_filename, false);
   1120 
   1121   return true;
   1122 }
   1123