Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/safe_browsing/prefix_set.h"
      6 
      7 #include <algorithm>
      8 #include <math.h>
      9 
     10 #include "base/file_util.h"
     11 #include "base/logging.h"
     12 #include "base/md5.h"
     13 #include "base/metrics/histogram.h"
     14 
     15 namespace {
     16 
     17 // |kMagic| should be reasonably unique, and not match itself across
     18 // endianness changes.  I generated this value with:
     19 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
     20 static uint32 kMagic = 0x864088dd;
     21 
     22 // Current version the code writes out.
     23 static uint32 kVersion = 0x1;
     24 
     25 typedef struct {
     26   uint32 magic;
     27   uint32 version;
     28   uint32 index_size;
     29   uint32 deltas_size;
     30 } FileHeader;
     31 
     32 // For |std::upper_bound()| to find a prefix w/in a vector of pairs.
     33 bool PrefixLess(const std::pair<SBPrefix,size_t>& a,
     34                 const std::pair<SBPrefix,size_t>& b) {
     35   return a.first < b.first;
     36 }
     37 
     38 }  // namespace
     39 
     40 namespace safe_browsing {
     41 
     42 PrefixSet::PrefixSet(const std::vector<SBPrefix>& sorted_prefixes) {
     43   if (sorted_prefixes.size()) {
     44     // Estimate the resulting vector sizes.  There will be strictly
     45     // more than |min_runs| entries in |index_|, but there generally
     46     // aren't many forced breaks.
     47     const size_t min_runs = sorted_prefixes.size() / kMaxRun;
     48     index_.reserve(min_runs);
     49     deltas_.reserve(sorted_prefixes.size() - min_runs);
     50 
     51     // Lead with the first prefix.
     52     SBPrefix prev_prefix = sorted_prefixes[0];
     53     size_t run_length = 0;
     54     index_.push_back(std::make_pair(prev_prefix, deltas_.size()));
     55 
     56     for (size_t i = 1; i < sorted_prefixes.size(); ++i) {
     57       // Skip duplicates.
     58       if (sorted_prefixes[i] == prev_prefix)
     59         continue;
     60 
     61       // Calculate the delta.  |unsigned| is mandatory, because the
     62       // sorted_prefixes could be more than INT_MAX apart.
     63       DCHECK_GT(sorted_prefixes[i], prev_prefix);
     64       const unsigned delta = sorted_prefixes[i] - prev_prefix;
     65       const uint16 delta16 = static_cast<uint16>(delta);
     66 
     67       // New index ref if the delta doesn't fit, or if too many
     68       // consecutive deltas have been encoded.
     69       if (delta != static_cast<unsigned>(delta16) || run_length >= kMaxRun) {
     70         index_.push_back(std::make_pair(sorted_prefixes[i], deltas_.size()));
     71         run_length = 0;
     72       } else {
     73         // Continue the run of deltas.
     74         deltas_.push_back(delta16);
     75         DCHECK_EQ(static_cast<unsigned>(deltas_.back()), delta);
     76         ++run_length;
     77       }
     78 
     79       prev_prefix = sorted_prefixes[i];
     80     }
     81 
     82     // Send up some memory-usage stats.  Bits because fractional bytes
     83     // are weird.
     84     const size_t bits_used = index_.size() * sizeof(index_[0]) * CHAR_BIT +
     85         deltas_.size() * sizeof(deltas_[0]) * CHAR_BIT;
     86     const size_t unique_prefixes = index_.size() + deltas_.size();
     87     static const size_t kMaxBitsPerPrefix = sizeof(SBPrefix) * CHAR_BIT;
     88     UMA_HISTOGRAM_ENUMERATION("SB2.PrefixSetBitsPerPrefix",
     89                               bits_used / unique_prefixes,
     90                               kMaxBitsPerPrefix);
     91   }
     92 }
     93 
     94 PrefixSet::PrefixSet(std::vector<std::pair<SBPrefix,size_t> > *index,
     95                      std::vector<uint16> *deltas) {
     96   DCHECK(index && deltas);
     97   index_.swap(*index);
     98   deltas_.swap(*deltas);
     99 }
    100 
    101 PrefixSet::~PrefixSet() {}
    102 
    103 bool PrefixSet::Exists(SBPrefix prefix) const {
    104   if (index_.empty())
    105     return false;
    106 
    107   // Find the first position after |prefix| in |index_|.
    108   std::vector<std::pair<SBPrefix,size_t> >::const_iterator
    109       iter = std::upper_bound(index_.begin(), index_.end(),
    110                               std::pair<SBPrefix,size_t>(prefix, 0),
    111                               PrefixLess);
    112 
    113   // |prefix| comes before anything that's in the set.
    114   if (iter == index_.begin())
    115     return false;
    116 
    117   // Capture the upper bound of our target entry's deltas.
    118   const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);
    119 
    120   // Back up to the entry our target is in.
    121   --iter;
    122 
    123   // All prefixes in |index_| are in the set.
    124   SBPrefix current = iter->first;
    125   if (current == prefix)
    126     return true;
    127 
    128   // Scan forward accumulating deltas while a match is possible.
    129   for (size_t di = iter->second; di < bound && current < prefix; ++di) {
    130     current += deltas_[di];
    131   }
    132 
    133   return current == prefix;
    134 }
    135 
    136 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
    137   prefixes->reserve(index_.size() + deltas_.size());
    138 
    139   for (size_t ii = 0; ii < index_.size(); ++ii) {
    140     // The deltas for this |index_| entry run to the next index entry,
    141     // or the end of the deltas.
    142     const size_t deltas_end =
    143         (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();
    144 
    145     SBPrefix current = index_[ii].first;
    146     prefixes->push_back(current);
    147     for (size_t di = index_[ii].second; di < deltas_end; ++di) {
    148       current += deltas_[di];
    149       prefixes->push_back(current);
    150     }
    151   }
    152 }
    153 
    154 // static
    155 PrefixSet* PrefixSet::LoadFile(const base::FilePath& filter_name) {
    156   int64 size_64;
    157   if (!file_util::GetFileSize(filter_name, &size_64))
    158     return NULL;
    159   using base::MD5Digest;
    160   if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))
    161     return NULL;
    162 
    163   file_util::ScopedFILE file(file_util::OpenFile(filter_name, "rb"));
    164   if (!file.get())
    165     return NULL;
    166 
    167   FileHeader header;
    168   size_t read = fread(&header, sizeof(header), 1, file.get());
    169   if (read != 1)
    170     return NULL;
    171 
    172   if (header.magic != kMagic || header.version != kVersion)
    173     return NULL;
    174 
    175   std::vector<std::pair<SBPrefix,size_t> > index;
    176   const size_t index_bytes = sizeof(index[0]) * header.index_size;
    177 
    178   std::vector<uint16> deltas;
    179   const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
    180 
    181   // Check for bogus sizes before allocating any space.
    182   const size_t expected_bytes =
    183       sizeof(header) + index_bytes + deltas_bytes + sizeof(MD5Digest);
    184   if (static_cast<int64>(expected_bytes) != size_64)
    185     return NULL;
    186 
    187   // The file looks valid, start building the digest.
    188   base::MD5Context context;
    189   base::MD5Init(&context);
    190   base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
    191                                               sizeof(header)));
    192 
    193   // Read the index vector.  Herb Sutter indicates that vectors are
    194   // guaranteed to be contiuguous, so reading to where element 0 lives
    195   // is valid.
    196   if (header.index_size) {
    197     index.resize(header.index_size);
    198     read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
    199     if (read != index.size())
    200       return NULL;
    201     base::MD5Update(&context,
    202                     base::StringPiece(reinterpret_cast<char*>(&(index[0])),
    203                                       index_bytes));
    204   }
    205 
    206   // Read vector of deltas.
    207   if (header.deltas_size) {
    208     deltas.resize(header.deltas_size);
    209     read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
    210     if (read != deltas.size())
    211       return NULL;
    212     base::MD5Update(&context,
    213                     base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),
    214                                       deltas_bytes));
    215   }
    216 
    217   base::MD5Digest calculated_digest;
    218   base::MD5Final(&calculated_digest, &context);
    219 
    220   base::MD5Digest file_digest;
    221   read = fread(&file_digest, sizeof(file_digest), 1, file.get());
    222   if (read != 1)
    223     return NULL;
    224 
    225   if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
    226     return NULL;
    227 
    228   // Steals contents of |index| and |deltas| via swap().
    229   return new PrefixSet(&index, &deltas);
    230 }
    231 
    232 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {
    233   FileHeader header;
    234   header.magic = kMagic;
    235   header.version = kVersion;
    236   header.index_size = static_cast<uint32>(index_.size());
    237   header.deltas_size = static_cast<uint32>(deltas_.size());
    238 
    239   // Sanity check that the 32-bit values never mess things up.
    240   if (static_cast<size_t>(header.index_size) != index_.size() ||
    241       static_cast<size_t>(header.deltas_size) != deltas_.size()) {
    242     NOTREACHED();
    243     return false;
    244   }
    245 
    246   file_util::ScopedFILE file(file_util::OpenFile(filter_name, "wb"));
    247   if (!file.get())
    248     return false;
    249 
    250   base::MD5Context context;
    251   base::MD5Init(&context);
    252 
    253   // TODO(shess): The I/O code in safe_browsing_store_file.cc would
    254   // sure be useful about now.
    255   size_t written = fwrite(&header, sizeof(header), 1, file.get());
    256   if (written != 1)
    257     return false;
    258   base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
    259                                               sizeof(header)));
    260 
    261   // As for reads, the standard guarantees the ability to access the
    262   // contents of the vector by a pointer to an element.
    263   if (index_.size()) {
    264     const size_t index_bytes = sizeof(index_[0]) * index_.size();
    265     written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),
    266                      file.get());
    267     if (written != index_.size())
    268       return false;
    269     base::MD5Update(&context,
    270                     base::StringPiece(
    271                         reinterpret_cast<const char*>(&(index_[0])),
    272                         index_bytes));
    273   }
    274 
    275   if (deltas_.size()) {
    276     const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
    277     written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
    278                      file.get());
    279     if (written != deltas_.size())
    280       return false;
    281     base::MD5Update(&context,
    282                     base::StringPiece(
    283                         reinterpret_cast<const char*>(&(deltas_[0])),
    284                         deltas_bytes));
    285   }
    286 
    287   base::MD5Digest digest;
    288   base::MD5Final(&digest, &context);
    289   written = fwrite(&digest, sizeof(digest), 1, file.get());
    290   if (written != 1)
    291     return false;
    292 
    293   // TODO(shess): Can this code check that the close was successful?
    294   file.reset();
    295 
    296   return true;
    297 }
    298 
    299 }  // namespace safe_browsing
    300