Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/safe_browsing/safe_browsing_store.h"
      6 
      7 #include <algorithm>
      8 
      9 #include "base/metrics/histogram.h"
     10 
     11 namespace {
     12 
     13 // Find items matching between |subs| and |adds|, and remove them,
     14 // recording the item from |adds| in |adds_removed|.  To minimize
     15 // copies, the inputs are processing in parallel, so |subs| and |adds|
     16 // should be compatibly ordered (either by SBAddPrefixLess or
     17 // SBAddPrefixHashLess).
     18 //
     19 // |predAS| provides add < sub, |predSA| provides sub < add, for the
     20 // tightest compare appropriate (see calls in SBProcessSubs).
     21 template <class S, class A, typename PredAS, typename PredSA>
     22 void KnockoutSubs(std::vector<S>* subs,
     23                   std::vector<A>* adds,
     24                   PredAS predAS, PredSA predSA,
     25                   std::vector<A>* adds_removed) {
     26   // Keep a pair of output iterators for writing kept items.  Due to
     27   // deletions, these may lag the main iterators.  Using erase() on
     28   // individual items would result in O(N^2) copies.  Using std::list
     29   // would work around that, at double or triple the memory cost.
     30   typename std::vector<A>::iterator add_out = adds->begin();
     31   typename std::vector<S>::iterator sub_out = subs->begin();
     32 
     33   // Current location in vectors.
     34   // TODO(shess): I want these to be const_iterator, but then
     35   // std::copy() gets confused.  Could snag a const_iterator add_end,
     36   // or write an inline std::copy(), but it seems like I'm doing
     37   // something wrong.
     38   typename std::vector<A>::iterator add_iter = adds->begin();
     39   typename std::vector<S>::iterator sub_iter = subs->begin();
     40 
     41   while (add_iter != adds->end() && sub_iter != subs->end()) {
     42     // If |*sub_iter| < |*add_iter|, retain the sub.
     43     if (predSA(*sub_iter, *add_iter)) {
     44       *sub_out = *sub_iter;
     45       ++sub_out;
     46       ++sub_iter;
     47 
     48       // If |*add_iter| < |*sub_iter|, retain the add.
     49     } else if (predAS(*add_iter, *sub_iter)) {
     50       *add_out = *add_iter;
     51       ++add_out;
     52       ++add_iter;
     53 
     54       // Record equal items and drop them.
     55     } else {
     56       adds_removed->push_back(*add_iter);
     57       ++add_iter;
     58       ++sub_iter;
     59     }
     60   }
     61 
     62   // Erase any leftover gap.
     63   adds->erase(add_out, add_iter);
     64   subs->erase(sub_out, sub_iter);
     65 }
     66 
     67 // Remove items in |removes| from |full_hashes|.  |full_hashes| and
     68 // |removes| should be ordered by SBAddPrefix component.
     69 template <class T>
     70 void RemoveMatchingPrefixes(const std::vector<SBAddPrefix>& removes,
     71                             std::vector<T>* full_hashes) {
     72   // This is basically an inline of std::set_difference().
     73   // Unfortunately, that algorithm requires that the two iterator
     74   // pairs use the same value types.
     75 
     76   // Where to store kept items.
     77   typename std::vector<T>::iterator out = full_hashes->begin();
     78 
     79   typename std::vector<T>::iterator hash_iter = full_hashes->begin();
     80   std::vector<SBAddPrefix>::const_iterator remove_iter = removes.begin();
     81 
     82   while (hash_iter != full_hashes->end() && remove_iter != removes.end()) {
     83     // Keep items less than |*remove_iter|.
     84     if (SBAddPrefixLess(*hash_iter, *remove_iter)) {
     85       *out = *hash_iter;
     86       ++out;
     87       ++hash_iter;
     88 
     89       // No hit for |*remove_iter|, bump it forward.
     90     } else if (SBAddPrefixLess(*remove_iter, *hash_iter)) {
     91       ++remove_iter;
     92 
     93       // Drop equal items, there may be multiple hits.
     94     } else {
     95       do {
     96         ++hash_iter;
     97       } while (hash_iter != full_hashes->end() &&
     98                !SBAddPrefixLess(*remove_iter, *hash_iter));
     99       ++remove_iter;
    100     }
    101   }
    102 
    103   // Erase any leftover gap.
    104   full_hashes->erase(out, hash_iter);
    105 }
    106 
    107 // Remove deleted items (|chunk_id| in |del_set|) from the vector.
    108 template <class T>
    109 void RemoveDeleted(std::vector<T>* vec, const base::hash_set<int32>& del_set) {
    110   DCHECK(vec);
    111 
    112   // Scan through the items read, dropping the items in |del_set|.
    113   typename std::vector<T>::iterator add_iter = vec->begin();
    114   for (typename std::vector<T>::iterator iter = add_iter;
    115        iter != vec->end(); ++iter) {
    116     if (del_set.count(iter->chunk_id) == 0) {
    117       *add_iter = *iter;
    118       ++add_iter;
    119     }
    120   }
    121   vec->erase(add_iter, vec->end());
    122 }
    123 
    124 enum MissTypes {
    125   MISS_TYPE_ALL,
    126   MISS_TYPE_FALSE,
    127 
    128   // Always at the end.
    129   MISS_TYPE_MAX
    130 };
    131 
    132 }  // namespace
    133 
    134 void SBCheckPrefixMisses(const std::vector<SBAddPrefix>& add_prefixes,
    135                          const std::set<SBPrefix>& prefix_misses) {
    136   if (prefix_misses.empty())
    137     return;
    138 
    139   // Record a hit for all prefixes which missed when sent to the
    140   // server.
    141   for (size_t i = 0; i < prefix_misses.size(); ++i) {
    142     UMA_HISTOGRAM_ENUMERATION("SB2.BloomFilterFalsePositives",
    143                               MISS_TYPE_ALL, MISS_TYPE_MAX);
    144   }
    145 
    146   // Collect the misses which are not present in |add_prefixes|.
    147   // Since |add_prefixes| can contain multiple copies of the same
    148   // prefix, it is not sufficient to count the number of elements
    149   // present in both collections.
    150   std::set<SBPrefix> false_misses(prefix_misses.begin(), prefix_misses.end());
    151   for (size_t i = 0; i < add_prefixes.size(); ++i) {
    152     // |erase()| on an absent element should cost like |find()|.
    153     false_misses.erase(add_prefixes[i].prefix);
    154   }
    155 
    156   // Record a hit for prefixes which we shouldn't have sent in the
    157   // first place.
    158   for (size_t i = 0; i < false_misses.size(); ++i) {
    159     UMA_HISTOGRAM_ENUMERATION("SB2.BloomFilterFalsePositives",
    160                               MISS_TYPE_FALSE, MISS_TYPE_MAX);
    161   }
    162 
    163   // Divide |MISS_TYPE_FALSE| by |MISS_TYPE_ALL| to get the
    164   // bloom-filter false-positive rate.
    165 }
    166 
    167 void SBProcessSubs(std::vector<SBAddPrefix>* add_prefixes,
    168                    std::vector<SBSubPrefix>* sub_prefixes,
    169                    std::vector<SBAddFullHash>* add_full_hashes,
    170                    std::vector<SBSubFullHash>* sub_full_hashes,
    171                    const base::hash_set<int32>& add_chunks_deleted,
    172                    const base::hash_set<int32>& sub_chunks_deleted) {
    173   // It is possible to structure templates and template
    174   // specializations such that the following calls work without having
    175   // to qualify things.  It becomes very arbitrary, though, and less
    176   // clear how things are working.
    177 
    178   // Sort the inputs by the SBAddPrefix bits.
    179   std::sort(add_prefixes->begin(), add_prefixes->end(),
    180             SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
    181   std::sort(sub_prefixes->begin(), sub_prefixes->end(),
    182             SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
    183   std::sort(add_full_hashes->begin(), add_full_hashes->end(),
    184             SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
    185   std::sort(sub_full_hashes->begin(), sub_full_hashes->end(),
    186             SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
    187 
    188   // Factor out the prefix subs.
    189   std::vector<SBAddPrefix> removed_adds;
    190   KnockoutSubs(sub_prefixes, add_prefixes,
    191                SBAddPrefixLess<SBAddPrefix,SBSubPrefix>,
    192                SBAddPrefixLess<SBSubPrefix,SBAddPrefix>,
    193                &removed_adds);
    194 
    195   // Remove the full-hashes corrosponding to the adds which
    196   // KnockoutSubs() removed.  Processing these w/in KnockoutSubs()
    197   // would make the code more complicated, and they are very small
    198   // relative to the prefix lists so the gain would be modest.
    199   RemoveMatchingPrefixes(removed_adds, add_full_hashes);
    200   RemoveMatchingPrefixes(removed_adds, sub_full_hashes);
    201 
    202   // http://crbug.com/52385
    203   // TODO(shess): AFAICT this pass is not done on the trunk.  I
    204   // believe that's a bug, but it may not matter because full-hash
    205   // subs almost never happen (I think you'd need multiple collisions
    206   // where one of the sites stopped being flagged?).  Enable this once
    207   // everything is in.  [if(0) instead of #ifdef 0 to make sure it
    208   // compiles.]
    209   if (0) {
    210     // Factor out the full-hash subs.
    211     std::vector<SBAddFullHash> removed_full_adds;
    212     KnockoutSubs(sub_full_hashes, add_full_hashes,
    213                  SBAddPrefixHashLess<SBAddFullHash,SBSubFullHash>,
    214                  SBAddPrefixHashLess<SBSubFullHash,SBAddFullHash>,
    215                  &removed_full_adds);
    216   }
    217 
    218   // Remove items from the deleted chunks.  This is done after other
    219   // processing to allow subs to knock out adds (and be removed) even
    220   // if the add's chunk is deleted.
    221   RemoveDeleted(add_prefixes, add_chunks_deleted);
    222   RemoveDeleted(sub_prefixes, sub_chunks_deleted);
    223   RemoveDeleted(add_full_hashes, add_chunks_deleted);
    224   RemoveDeleted(sub_full_hashes, sub_chunks_deleted);
    225 }
    226