Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/safe_browsing/prefix_set.h"
      6 
      7 #include <algorithm>
      8 
      9 #include "base/file_util.h"
     10 #include "base/files/scoped_file.h"
     11 #include "base/logging.h"
     12 #include "base/md5.h"
     13 #include "base/metrics/histogram.h"
     14 #include "base/metrics/sparse_histogram.h"
     15 
     16 namespace {
     17 
     18 // |kMagic| should be reasonably unique, and not match itself across
     19 // endianness changes.  I generated this value with:
     20 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
     21 static uint32 kMagic = 0x864088dd;
     22 
     23 // Version history:
     24 // Version 1: b6cb7cfe/r74487 by shess (at) chromium.org on 2011-02-10
     25 // Version 2: 2b59b0a6/r253924 by shess (at) chromium.org on 2014-02-27
     26 // Version 3: ????????/r?????? by shess (at) chromium.org on 2014-04-??
     27 
     28 // Version 2 layout is identical to version 1.  The sort order of |index_|
     29 // changed from |int32| to |uint32| to match the change of |SBPrefix|.
     30 // Version 3 adds storage for full hashes.
     31 static uint32 kVersion = 3;
     32 static uint32 kDeprecatedVersion = 1;  // And lower.
     33 
     34 typedef struct {
     35   uint32 magic;
     36   uint32 version;
     37   uint32 index_size;
     38   uint32 deltas_size;
     39 } FileHeader_v2;
     40 
     41 typedef struct {
     42   uint32 magic;
     43   uint32 version;
     44   uint32 index_size;
     45   uint32 deltas_size;
     46   uint32 full_hashes_size;
     47 } FileHeader;
     48 
     49 // Common std::vector<> implementations add capacity by multiplying from the
     50 // current size (usually either by 2 or 1.5) to satisfy push_back() running in
     51 // amortized constant time.  This is not necessary for insert() at end(), but
     52 // AFAICT it seems true for some implementations.  SBPrefix values should
     53 // uniformly cover the 32-bit space, so the final size can be estimated given a
     54 // subset of the input.
     55 //
     56 // |kEstimateThreshold| is when estimates start converging.  Results are strong
     57 // starting around 1<<27.  1<<30 is chosen to prevent the degenerate case of
     58 // resizing capacity from >50% to 100%.
     59 //
     60 // TODO(shess): I'm sure there is math in the world to describe good settings
     61 // for estimating the size of a uniformly-distributed set of integers from a
     62 // sorted subset.  I do not have such math in me, so I assumed that my current
     63 // organic database of prefixes was scale-free, and wrote a script to see how
     64 // often given slop values would always suffice for given strides.  At 1<<30,
     65 // .5% slop was sufficient to cover all cases (though the code below uses 1%).
     66 //
     67 // TODO(shess): A smaller threshold uses less transient space in reallocation.
     68 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc.  The cost
     69 // is that a smaller threshold needs more slop (locked down for the long term).
     70 // 1<<29 worked well with 1%, 1<<27 worked well with 2%.
     71 const SBPrefix kEstimateThreshold = 1 << 30;
     72 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {
     73   // estimated_count / current_count == estimated_max / current_prefix
     74   // For large input sets, estimated_max of 2^32 is close enough.
     75   const size_t estimated_prefix_count = static_cast<size_t>(
     76       (static_cast<uint64>(current_count) << 32) / current_prefix);
     77 
     78   // The estimate has an error bar, if the final total is below the estimate, no
     79   // harm, but if it is above a capacity resize will happen at nearly 100%.  Add
     80   // some slop to make sure all cases are covered.
     81   return estimated_prefix_count + estimated_prefix_count / 100;
     82 }
     83 
     84 }  // namespace
     85 
     86 namespace safe_browsing {
     87 
     88 // For |std::upper_bound()| to find a prefix w/in a vector of pairs.
     89 // static
     90 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {
     91   return a.first < b.first;
     92 }
     93 
     94 PrefixSet::PrefixSet() {
     95 }
     96 
     97 PrefixSet::PrefixSet(IndexVector* index,
     98                      std::vector<uint16>* deltas,
     99                      std::vector<SBFullHash>* full_hashes) {
    100   DCHECK(index && deltas && full_hashes);
    101   index_.swap(*index);
    102   deltas_.swap(*deltas);
    103   full_hashes_.swap(*full_hashes);
    104 }
    105 
    106 PrefixSet::~PrefixSet() {}
    107 
    108 bool PrefixSet::PrefixExists(SBPrefix prefix) const {
    109   if (index_.empty())
    110     return false;
    111 
    112   // Find the first position after |prefix| in |index_|.
    113   IndexVector::const_iterator iter =
    114       std::upper_bound(index_.begin(), index_.end(),
    115                        IndexPair(prefix, 0), PrefixLess);
    116 
    117   // |prefix| comes before anything that's in the set.
    118   if (iter == index_.begin())
    119     return false;
    120 
    121   // Capture the upper bound of our target entry's deltas.
    122   const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);
    123 
    124   // Back up to the entry our target is in.
    125   --iter;
    126 
    127   // All prefixes in |index_| are in the set.
    128   SBPrefix current = iter->first;
    129   if (current == prefix)
    130     return true;
    131 
    132   // Scan forward accumulating deltas while a match is possible.
    133   for (size_t di = iter->second; di < bound && current < prefix; ++di) {
    134     current += deltas_[di];
    135   }
    136 
    137   return current == prefix;
    138 }
    139 
    140 bool PrefixSet::Exists(const SBFullHash& hash) const {
    141   if (std::binary_search(full_hashes_.begin(), full_hashes_.end(),
    142                          hash, SBFullHashLess)) {
    143     return true;
    144   }
    145   return PrefixExists(hash.prefix);
    146 }
    147 
    148 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
    149   prefixes->reserve(index_.size() + deltas_.size());
    150 
    151   for (size_t ii = 0; ii < index_.size(); ++ii) {
    152     // The deltas for this |index_| entry run to the next index entry,
    153     // or the end of the deltas.
    154     const size_t deltas_end =
    155         (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();
    156 
    157     SBPrefix current = index_[ii].first;
    158     prefixes->push_back(current);
    159     for (size_t di = index_[ii].second; di < deltas_end; ++di) {
    160       current += deltas_[di];
    161       prefixes->push_back(current);
    162     }
    163   }
    164 }
    165 
    166 // static
    167 scoped_ptr<PrefixSet> PrefixSet::LoadFile(const base::FilePath& filter_name) {
    168   int64 size_64;
    169   if (!base::GetFileSize(filter_name, &size_64))
    170     return scoped_ptr<PrefixSet>();
    171   using base::MD5Digest;
    172   // TODO(shess): Revert to sizeof(FileHeader) for sanity check once v2 is
    173   // deprecated.
    174   if (size_64 < static_cast<int64>(sizeof(FileHeader_v2) + sizeof(MD5Digest)))
    175     return scoped_ptr<PrefixSet>();
    176 
    177   base::ScopedFILE file(base::OpenFile(filter_name, "rb"));
    178   if (!file.get())
    179     return scoped_ptr<PrefixSet>();
    180 
    181   FileHeader header;
    182   size_t read = fread(&header, sizeof(header), 1, file.get());
    183   if (read != 1)
    184     return scoped_ptr<PrefixSet>();
    185 
    186   // The file looks valid, start building the digest.
    187   base::MD5Context context;
    188   base::MD5Init(&context);
    189   base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
    190                                               sizeof(header)));
    191 
    192   if (header.magic != kMagic)
    193     return scoped_ptr<PrefixSet>();
    194 
    195   // Track version read to inform removal of support for older versions.
    196   UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version);
    197 
    198   // TODO(shess): <http://crbug.com/368044> for removing v2 support.
    199   size_t header_size = sizeof(header);
    200   if (header.version <= kDeprecatedVersion) {
    201     return scoped_ptr<PrefixSet>();
    202   } else if (header.version == 2) {
    203     // Rewind the file and restart building the digest with the old header
    204     // structure.
    205     FileHeader_v2 v2_header;
    206     if (0 != fseek(file.get(), 0, SEEK_SET))
    207       return scoped_ptr<PrefixSet>();
    208 
    209     size_t read = fread(&v2_header, sizeof(v2_header), 1, file.get());
    210     if (read != 1)
    211       return scoped_ptr<PrefixSet>();
    212 
    213     base::MD5Init(&context);
    214     base::MD5Update(&context,
    215                     base::StringPiece(reinterpret_cast<char*>(&v2_header),
    216                                       sizeof(v2_header)));
    217 
    218     // The current header is a superset of the old header, fill it in with the
    219     // information read.
    220     header.magic = v2_header.magic;
    221     header.version = v2_header.version;
    222     header.index_size = v2_header.index_size;
    223     header.deltas_size = v2_header.deltas_size;
    224     header.full_hashes_size = 0;
    225     header_size = sizeof(v2_header);
    226   } else if (header.version != kVersion) {
    227     return scoped_ptr<PrefixSet>();
    228   }
    229 
    230   IndexVector index;
    231   const size_t index_bytes = sizeof(index[0]) * header.index_size;
    232 
    233   std::vector<uint16> deltas;
    234   const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
    235 
    236   std::vector<SBFullHash> full_hashes;
    237   const size_t full_hashes_bytes =
    238       sizeof(full_hashes[0]) * header.full_hashes_size;
    239 
    240   // Check for bogus sizes before allocating any space.
    241   const size_t expected_bytes = header_size +
    242       index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);
    243   if (static_cast<int64>(expected_bytes) != size_64)
    244     return scoped_ptr<PrefixSet>();
    245 
    246   // Read the index vector.  Herb Sutter indicates that vectors are
    247   // guaranteed to be contiuguous, so reading to where element 0 lives
    248   // is valid.
    249   if (header.index_size) {
    250     index.resize(header.index_size);
    251     read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
    252     if (read != index.size())
    253       return scoped_ptr<PrefixSet>();
    254     base::MD5Update(&context,
    255                     base::StringPiece(reinterpret_cast<char*>(&(index[0])),
    256                                       index_bytes));
    257   }
    258 
    259   // Read vector of deltas.
    260   if (header.deltas_size) {
    261     deltas.resize(header.deltas_size);
    262     read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
    263     if (read != deltas.size())
    264       return scoped_ptr<PrefixSet>();
    265     base::MD5Update(&context,
    266                     base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),
    267                                       deltas_bytes));
    268   }
    269 
    270   // Read vector of full hashes.
    271   if (header.full_hashes_size) {
    272     full_hashes.resize(header.full_hashes_size);
    273     read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(),
    274                  file.get());
    275     if (read != full_hashes.size())
    276       return scoped_ptr<PrefixSet>();
    277     base::MD5Update(&context,
    278                     base::StringPiece(
    279                         reinterpret_cast<char*>(&(full_hashes[0])),
    280                         full_hashes_bytes));
    281   }
    282 
    283   base::MD5Digest calculated_digest;
    284   base::MD5Final(&calculated_digest, &context);
    285 
    286   base::MD5Digest file_digest;
    287   read = fread(&file_digest, sizeof(file_digest), 1, file.get());
    288   if (read != 1)
    289     return scoped_ptr<PrefixSet>();
    290 
    291   if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
    292     return scoped_ptr<PrefixSet>();
    293 
    294   // Steals vector contents using swap().
    295   return scoped_ptr<PrefixSet>(new PrefixSet(&index, &deltas, &full_hashes));
    296 }
    297 
    298 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {
    299   FileHeader header;
    300   header.magic = kMagic;
    301   header.version = kVersion;
    302   header.index_size = static_cast<uint32>(index_.size());
    303   header.deltas_size = static_cast<uint32>(deltas_.size());
    304   header.full_hashes_size = static_cast<uint32>(full_hashes_.size());
    305 
    306   // Sanity check that the 32-bit values never mess things up.
    307   if (static_cast<size_t>(header.index_size) != index_.size() ||
    308       static_cast<size_t>(header.deltas_size) != deltas_.size() ||
    309       static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {
    310     NOTREACHED();
    311     return false;
    312   }
    313 
    314   base::ScopedFILE file(base::OpenFile(filter_name, "wb"));
    315   if (!file.get())
    316     return false;
    317 
    318   base::MD5Context context;
    319   base::MD5Init(&context);
    320 
    321   // TODO(shess): The I/O code in safe_browsing_store_file.cc would
    322   // sure be useful about now.
    323   size_t written = fwrite(&header, sizeof(header), 1, file.get());
    324   if (written != 1)
    325     return false;
    326   base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
    327                                               sizeof(header)));
    328 
    329   // As for reads, the standard guarantees the ability to access the
    330   // contents of the vector by a pointer to an element.
    331   if (index_.size()) {
    332     const size_t index_bytes = sizeof(index_[0]) * index_.size();
    333     written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),
    334                      file.get());
    335     if (written != index_.size())
    336       return false;
    337     base::MD5Update(&context,
    338                     base::StringPiece(
    339                         reinterpret_cast<const char*>(&(index_[0])),
    340                         index_bytes));
    341   }
    342 
    343   if (deltas_.size()) {
    344     const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
    345     written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
    346                      file.get());
    347     if (written != deltas_.size())
    348       return false;
    349     base::MD5Update(&context,
    350                     base::StringPiece(
    351                         reinterpret_cast<const char*>(&(deltas_[0])),
    352                         deltas_bytes));
    353   }
    354 
    355   if (full_hashes_.size()) {
    356     const size_t elt_size = sizeof(full_hashes_[0]);
    357     const size_t elts = full_hashes_.size();
    358     const size_t full_hashes_bytes = elt_size * elts;
    359     written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get());
    360     if (written != elts)
    361       return false;
    362     base::MD5Update(&context,
    363                     base::StringPiece(
    364                         reinterpret_cast<const char*>(&(full_hashes_[0])),
    365                         full_hashes_bytes));
    366   }
    367 
    368   base::MD5Digest digest;
    369   base::MD5Final(&digest, &context);
    370   written = fwrite(&digest, sizeof(digest), 1, file.get());
    371   if (written != 1)
    372     return false;
    373 
    374   // TODO(shess): Can this code check that the close was successful?
    375   file.reset();
    376 
    377   return true;
    378 }
    379 
    380 void PrefixSet::AddRun(SBPrefix index_prefix,
    381                        const uint16* run_begin, const uint16* run_end) {
    382   // Preempt organic capacity decisions for |delta_| once strong estimates can
    383   // be made.
    384   if (index_prefix > kEstimateThreshold &&
    385       deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {
    386     deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));
    387   }
    388 
    389   index_.push_back(std::make_pair(index_prefix, deltas_.size()));
    390   deltas_.insert(deltas_.end(), run_begin, run_end);
    391 }
    392 
    393 PrefixSetBuilder::PrefixSetBuilder()
    394     : prefix_set_(new PrefixSet()) {
    395 }
    396 
    397 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)
    398     : prefix_set_(new PrefixSet()) {
    399   for (size_t i = 0; i < prefixes.size(); ++i) {
    400     AddPrefix(prefixes[i]);
    401   }
    402 }
    403 
    404 PrefixSetBuilder::~PrefixSetBuilder() {
    405 }
    406 
    407 scoped_ptr<PrefixSet> PrefixSetBuilder::GetPrefixSet(
    408     const std::vector<SBFullHash>& hashes) {
    409   DCHECK(prefix_set_.get());
    410 
    411   // Flush runs until buffered data is gone.
    412   while (!buffer_.empty()) {
    413     EmitRun();
    414   }
    415 
    416   // Precisely size |index_| for read-only.  It's 50k-60k, so minor savings, but
    417   // they're almost free.
    418   PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_);
    419 
    420   prefix_set_->full_hashes_ = hashes;
    421   std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(),
    422             SBFullHashLess);
    423 
    424   return prefix_set_.Pass();
    425 }
    426 
    427 scoped_ptr<PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {
    428   return GetPrefixSet(std::vector<SBFullHash>()).Pass();
    429 }
    430 
    431 void PrefixSetBuilder::EmitRun() {
    432   DCHECK(prefix_set_.get());
    433 
    434   SBPrefix prev_prefix = buffer_[0];
    435   uint16 run[PrefixSet::kMaxRun];
    436   size_t run_pos = 0;
    437 
    438   size_t i;
    439   for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {
    440     // Calculate the delta.  |unsigned| is mandatory, because the
    441     // sorted_prefixes could be more than INT_MAX apart.
    442     DCHECK_GT(buffer_[i], prev_prefix);
    443     const unsigned delta = buffer_[i] - prev_prefix;
    444     const uint16 delta16 = static_cast<uint16>(delta);
    445 
    446     // Break the run if the delta doesn't fit.
    447     if (delta != static_cast<unsigned>(delta16))
    448       break;
    449 
    450     // Continue the run of deltas.
    451     run[run_pos++] = delta16;
    452     DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);
    453 
    454     prev_prefix = buffer_[i];
    455   }
    456   prefix_set_->AddRun(buffer_[0], run, run + run_pos);
    457   buffer_.erase(buffer_.begin(), buffer_.begin() + i);
    458 }
    459 
    460 void PrefixSetBuilder::AddPrefix(SBPrefix prefix) {
    461   DCHECK(prefix_set_.get());
    462 
    463   if (buffer_.empty()) {
    464     DCHECK(prefix_set_->index_.empty());
    465     DCHECK(prefix_set_->deltas_.empty());
    466   } else {
    467     // Drop duplicates.
    468     if (buffer_.back() == prefix)
    469       return;
    470 
    471     DCHECK_LT(buffer_.back(), prefix);
    472   }
    473   buffer_.push_back(prefix);
    474 
    475   // Flush buffer when a run can be constructed.  +1 for the index item, and +1
    476   // to leave at least one item in the buffer for dropping duplicates.
    477   if (buffer_.size() > PrefixSet::kMaxRun + 2)
    478     EmitRun();
    479 }
    480 
    481 }  // namespace safe_browsing
    482