Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/safe_browsing/prefix_set.h"
      6 
      7 #include <algorithm>
      8 #include <math.h>
      9 
     10 #include "base/file_util.h"
     11 #include "base/logging.h"
     12 #include "base/md5.h"
     13 #include "base/metrics/histogram.h"
     14 
     15 namespace {
     16 
     17 // |kMagic| should be reasonably unique, and not match itself across
     18 // endianness changes.  I generated this value with:
     19 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
     20 static uint32 kMagic = 0x864088dd;
     21 
     22 // Current version the code writes out.
     23 static uint32 kVersion = 0x1;
     24 
     25 typedef struct {
     26   uint32 magic;
     27   uint32 version;
     28   uint32 index_size;
     29   uint32 deltas_size;
     30 } FileHeader;
     31 
     32 // For |std::upper_bound()| to find a prefix w/in a vector of pairs.
     33 bool PrefixLess(const std::pair<SBPrefix,size_t>& a,
     34                 const std::pair<SBPrefix,size_t>& b) {
     35   return a.first < b.first;
     36 }
     37 
     38 }  // namespace
     39 
     40 namespace safe_browsing {
     41 
     42 PrefixSet::PrefixSet(const std::vector<SBPrefix>& sorted_prefixes) {
     43   if (sorted_prefixes.size()) {
     44     // Lead with the first prefix.
     45     SBPrefix prev_prefix = sorted_prefixes[0];
     46     size_t run_length = 0;
     47     index_.push_back(std::make_pair(prev_prefix, deltas_.size()));
     48 
     49     // Used to build a checksum from the data used to construct the
     50     // structures.  Since the data is a bunch of uniform hashes, it
     51     // seems reasonable to just xor most of it in, rather than trying
     52     // to use a more complicated algorithm.
     53     uint32 checksum = static_cast<uint32>(sorted_prefixes[0]);
     54     checksum ^= static_cast<uint32>(deltas_.size());
     55 
     56     for (size_t i = 1; i < sorted_prefixes.size(); ++i) {
     57       // Skip duplicates.
     58       if (sorted_prefixes[i] == prev_prefix)
     59         continue;
     60 
     61       // Calculate the delta.  |unsigned| is mandatory, because the
     62       // sorted_prefixes could be more than INT_MAX apart.
     63       DCHECK_GT(sorted_prefixes[i], prev_prefix);
     64       const unsigned delta = sorted_prefixes[i] - prev_prefix;
     65       const uint16 delta16 = static_cast<uint16>(delta);
     66 
     67       // New index ref if the delta doesn't fit, or if too many
     68       // consecutive deltas have been encoded.
     69       if (delta != static_cast<unsigned>(delta16) || run_length >= kMaxRun) {
     70         checksum ^= static_cast<uint32>(sorted_prefixes[i]);
     71         checksum ^= static_cast<uint32>(deltas_.size());
     72         index_.push_back(std::make_pair(sorted_prefixes[i], deltas_.size()));
     73         run_length = 0;
     74       } else {
     75         checksum ^= static_cast<uint32>(delta16);
     76         // Continue the run of deltas.
     77         deltas_.push_back(delta16);
     78         DCHECK_EQ(static_cast<unsigned>(deltas_.back()), delta);
     79         ++run_length;
     80       }
     81 
     82       prev_prefix = sorted_prefixes[i];
     83     }
     84     checksum_ = checksum;
     85     DCHECK(CheckChecksum());
     86 
     87     // Send up some memory-usage stats.  Bits because fractional bytes
     88     // are weird.
     89     const size_t bits_used = index_.size() * sizeof(index_[0]) * CHAR_BIT +
     90         deltas_.size() * sizeof(deltas_[0]) * CHAR_BIT;
     91     const size_t unique_prefixes = index_.size() + deltas_.size();
     92     static const size_t kMaxBitsPerPrefix = sizeof(SBPrefix) * CHAR_BIT;
     93     UMA_HISTOGRAM_ENUMERATION("SB2.PrefixSetBitsPerPrefix",
     94                               bits_used / unique_prefixes,
     95                               kMaxBitsPerPrefix);
     96   }
     97 }
     98 
     99 PrefixSet::PrefixSet(std::vector<std::pair<SBPrefix,size_t> > *index,
    100                      std::vector<uint16> *deltas) {
    101   DCHECK(index && deltas);
    102   index_.swap(*index);
    103   deltas_.swap(*deltas);
    104 }
    105 
    106 PrefixSet::~PrefixSet() {}
    107 
    108 bool PrefixSet::Exists(SBPrefix prefix) const {
    109   if (index_.empty())
    110     return false;
    111 
    112   // Find the first position after |prefix| in |index_|.
    113   std::vector<std::pair<SBPrefix,size_t> >::const_iterator
    114       iter = std::upper_bound(index_.begin(), index_.end(),
    115                               std::pair<SBPrefix,size_t>(prefix, 0),
    116                               PrefixLess);
    117 
    118   // |prefix| comes before anything that's in the set.
    119   if (iter == index_.begin())
    120     return false;
    121 
    122   // Capture the upper bound of our target entry's deltas.
    123   const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);
    124 
    125   // Back up to the entry our target is in.
    126   --iter;
    127 
    128   // All prefixes in |index_| are in the set.
    129   SBPrefix current = iter->first;
    130   if (current == prefix)
    131     return true;
    132 
    133   // Scan forward accumulating deltas while a match is possible.
    134   for (size_t di = iter->second; di < bound && current < prefix; ++di) {
    135     current += deltas_[di];
    136   }
    137 
    138   return current == prefix;
    139 }
    140 
    141 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
    142   for (size_t ii = 0; ii < index_.size(); ++ii) {
    143     // The deltas for this |index_| entry run to the next index entry,
    144     // or the end of the deltas.
    145     const size_t deltas_end =
    146         (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();
    147 
    148     SBPrefix current = index_[ii].first;
    149     prefixes->push_back(current);
    150     for (size_t di = index_[ii].second; di < deltas_end; ++di) {
    151       current += deltas_[di];
    152       prefixes->push_back(current);
    153     }
    154   }
    155 }
    156 
    157 // static
    158 PrefixSet* PrefixSet::LoadFile(const FilePath& filter_name) {
    159   int64 size_64;
    160   if (!file_util::GetFileSize(filter_name, &size_64))
    161     return NULL;
    162   if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))
    163     return NULL;
    164 
    165   file_util::ScopedFILE file(file_util::OpenFile(filter_name, "rb"));
    166   if (!file.get())
    167     return NULL;
    168 
    169   FileHeader header;
    170   size_t read = fread(&header, sizeof(header), 1, file.get());
    171   if (read != 1)
    172     return NULL;
    173 
    174   if (header.magic != kMagic || header.version != kVersion)
    175     return NULL;
    176 
    177   std::vector<std::pair<SBPrefix,size_t> > index;
    178   const size_t index_bytes = sizeof(index[0]) * header.index_size;
    179 
    180   std::vector<uint16> deltas;
    181   const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
    182 
    183   // Check for bogus sizes before allocating any space.
    184   const size_t expected_bytes =
    185       sizeof(header) + index_bytes + deltas_bytes + sizeof(MD5Digest);
    186   if (static_cast<int64>(expected_bytes) != size_64)
    187     return NULL;
    188 
    189   // The file looks valid, start building the digest.
    190   MD5Context context;
    191   MD5Init(&context);
    192   MD5Update(&context, &header, sizeof(header));
    193 
    194   // Read the index vector.  Herb Sutter indicates that vectors are
    195   // guaranteed to be contiuguous, so reading to where element 0 lives
    196   // is valid.
    197   index.resize(header.index_size);
    198   read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
    199   if (read != index.size())
    200     return NULL;
    201   MD5Update(&context, &(index[0]), index_bytes);
    202 
    203   // Read vector of deltas.
    204   deltas.resize(header.deltas_size);
    205   read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
    206   if (read != deltas.size())
    207     return NULL;
    208   MD5Update(&context, &(deltas[0]), deltas_bytes);
    209 
    210   MD5Digest calculated_digest;
    211   MD5Final(&calculated_digest, &context);
    212 
    213   MD5Digest file_digest;
    214   read = fread(&file_digest, sizeof(file_digest), 1, file.get());
    215   if (read != 1)
    216     return NULL;
    217 
    218   if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
    219     return NULL;
    220 
    221   // Steals contents of |index| and |deltas| via swap().
    222   return new PrefixSet(&index, &deltas);
    223 }
    224 
    225 bool PrefixSet::WriteFile(const FilePath& filter_name) const {
    226   FileHeader header;
    227   header.magic = kMagic;
    228   header.version = kVersion;
    229   header.index_size = static_cast<uint32>(index_.size());
    230   header.deltas_size = static_cast<uint32>(deltas_.size());
    231 
    232   // Sanity check that the 32-bit values never mess things up.
    233   if (static_cast<size_t>(header.index_size) != index_.size() ||
    234       static_cast<size_t>(header.deltas_size) != deltas_.size()) {
    235     NOTREACHED();
    236     return false;
    237   }
    238 
    239   file_util::ScopedFILE file(file_util::OpenFile(filter_name, "wb"));
    240   if (!file.get())
    241     return false;
    242 
    243   MD5Context context;
    244   MD5Init(&context);
    245 
    246   // TODO(shess): The I/O code in safe_browsing_store_file.cc would
    247   // sure be useful about now.
    248   size_t written = fwrite(&header, sizeof(header), 1, file.get());
    249   if (written != 1)
    250     return false;
    251   MD5Update(&context, &header, sizeof(header));
    252 
    253   // As for reads, the standard guarantees the ability to access the
    254   // contents of the vector by a pointer to an element.
    255   const size_t index_bytes = sizeof(index_[0]) * index_.size();
    256   written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(), file.get());
    257   if (written != index_.size())
    258     return false;
    259   MD5Update(&context, &(index_[0]), index_bytes);
    260 
    261   const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
    262   written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
    263                    file.get());
    264   if (written != deltas_.size())
    265     return false;
    266   MD5Update(&context, &(deltas_[0]), deltas_bytes);
    267 
    268   MD5Digest digest;
    269   MD5Final(&digest, &context);
    270   written = fwrite(&digest, sizeof(digest), 1, file.get());
    271   if (written != 1)
    272     return false;
    273 
    274   // TODO(shess): Can this code check that the close was successful?
    275   file.reset();
    276 
    277   return true;
    278 }
    279 
    280 size_t PrefixSet::IndexBinFor(size_t target_index) const {
    281   // The items in |index_| have the logical index of each previous
    282   // item in |index_| plus the count of deltas between the items.
    283   // Since the indices into |deltas_| are absolute, the logical index
    284   // is then the sum of the two indices.
    285   size_t lo = 0;
    286   size_t hi = index_.size();
    287 
    288   // Binary search because linear search was too slow (really, the
    289   // unit test sucked).  Inline because the elements can't be compared
    290   // in isolation (their position matters).
    291   while (hi - lo > 1) {
    292     const size_t i = (lo + hi) / 2;
    293 
    294     if (target_index < i + index_[i].second) {
    295       DCHECK_LT(i, hi);  // Always making progress.
    296       hi = i;
    297     } else {
    298       DCHECK_GT(i, lo);  // Always making progress.
    299       lo = i;
    300     }
    301   }
    302   return lo;
    303 }
    304 
    305 size_t PrefixSet::GetSize() const {
    306   return index_.size() + deltas_.size();
    307 }
    308 
    309 bool PrefixSet::IsDeltaAt(size_t target_index) const {
    310   CHECK_LT(target_index, GetSize());
    311 
    312   const size_t i = IndexBinFor(target_index);
    313   return target_index > i + index_[i].second;
    314 }
    315 
    316 uint16 PrefixSet::DeltaAt(size_t target_index) const {
    317   CHECK_LT(target_index, GetSize());
    318 
    319   // Find the |index_| entry which contains |target_index|.
    320   const size_t i = IndexBinFor(target_index);
    321 
    322   // Exactly on the |index_| entry means no delta.
    323   CHECK_GT(target_index, i + index_[i].second);
    324 
    325   // -i backs out the |index_| entries, -1 gets the delta that lead to
    326   // the value at |target_index|.
    327   CHECK_LT(target_index - i - 1, deltas_.size());
    328   return deltas_[target_index - i - 1];
    329 }
    330 
    331 bool PrefixSet::CheckChecksum() const {
    332   uint32 checksum = 0;
    333 
    334   for (size_t ii = 0; ii < index_.size(); ++ii) {
    335     checksum ^= static_cast<uint32>(index_[ii].first);
    336     checksum ^= static_cast<uint32>(index_[ii].second);
    337   }
    338 
    339   for (size_t di = 0; di < deltas_.size(); ++di) {
    340     checksum ^= static_cast<uint32>(deltas_[di]);
    341   }
    342 
    343   return checksum == checksum_;
    344 }
    345 
    346 }  // namespace safe_browsing
    347