1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/safe_browsing/safe_browsing_store.h" 6 7 #include <algorithm> 8 9 #include "base/metrics/histogram.h" 10 11 namespace { 12 13 // Find items matching between |subs| and |adds|, and remove them, 14 // recording the item from |adds| in |adds_removed|. To minimize 15 // copies, the inputs are processing in parallel, so |subs| and |adds| 16 // should be compatibly ordered (either by SBAddPrefixLess or 17 // SBAddPrefixHashLess). 18 // 19 // |predAS| provides add < sub, |predSA| provides sub < add, for the 20 // tightest compare appropriate (see calls in SBProcessSubs). 21 template <class S, class A, typename PredAS, typename PredSA> 22 void KnockoutSubs(std::vector<S>* subs, 23 std::vector<A>* adds, 24 PredAS predAS, PredSA predSA, 25 std::vector<A>* adds_removed) { 26 // Keep a pair of output iterators for writing kept items. Due to 27 // deletions, these may lag the main iterators. Using erase() on 28 // individual items would result in O(N^2) copies. Using std::list 29 // would work around that, at double or triple the memory cost. 30 typename std::vector<A>::iterator add_out = adds->begin(); 31 typename std::vector<S>::iterator sub_out = subs->begin(); 32 33 // Current location in vectors. 34 // TODO(shess): I want these to be const_iterator, but then 35 // std::copy() gets confused. Could snag a const_iterator add_end, 36 // or write an inline std::copy(), but it seems like I'm doing 37 // something wrong. 38 typename std::vector<A>::iterator add_iter = adds->begin(); 39 typename std::vector<S>::iterator sub_iter = subs->begin(); 40 41 while (add_iter != adds->end() && sub_iter != subs->end()) { 42 // If |*sub_iter| < |*add_iter|, retain the sub. 43 if (predSA(*sub_iter, *add_iter)) { 44 *sub_out = *sub_iter; 45 ++sub_out; 46 ++sub_iter; 47 48 // If |*add_iter| < |*sub_iter|, retain the add. 49 } else if (predAS(*add_iter, *sub_iter)) { 50 *add_out = *add_iter; 51 ++add_out; 52 ++add_iter; 53 54 // Record equal items and drop them. 55 } else { 56 adds_removed->push_back(*add_iter); 57 ++add_iter; 58 ++sub_iter; 59 } 60 } 61 62 // Erase any leftover gap. 63 adds->erase(add_out, add_iter); 64 subs->erase(sub_out, sub_iter); 65 } 66 67 // Remove items in |removes| from |full_hashes|. |full_hashes| and 68 // |removes| should be ordered by SBAddPrefix component. 69 template <class T> 70 void RemoveMatchingPrefixes(const std::vector<SBAddPrefix>& removes, 71 std::vector<T>* full_hashes) { 72 // This is basically an inline of std::set_difference(). 73 // Unfortunately, that algorithm requires that the two iterator 74 // pairs use the same value types. 75 76 // Where to store kept items. 77 typename std::vector<T>::iterator out = full_hashes->begin(); 78 79 typename std::vector<T>::iterator hash_iter = full_hashes->begin(); 80 std::vector<SBAddPrefix>::const_iterator remove_iter = removes.begin(); 81 82 while (hash_iter != full_hashes->end() && remove_iter != removes.end()) { 83 // Keep items less than |*remove_iter|. 84 if (SBAddPrefixLess(*hash_iter, *remove_iter)) { 85 *out = *hash_iter; 86 ++out; 87 ++hash_iter; 88 89 // No hit for |*remove_iter|, bump it forward. 90 } else if (SBAddPrefixLess(*remove_iter, *hash_iter)) { 91 ++remove_iter; 92 93 // Drop equal items, there may be multiple hits. 94 } else { 95 do { 96 ++hash_iter; 97 } while (hash_iter != full_hashes->end() && 98 !SBAddPrefixLess(*remove_iter, *hash_iter)); 99 ++remove_iter; 100 } 101 } 102 103 // Erase any leftover gap. 104 full_hashes->erase(out, hash_iter); 105 } 106 107 // Remove deleted items (|chunk_id| in |del_set|) from the vector. 108 template <class T> 109 void RemoveDeleted(std::vector<T>* vec, const base::hash_set<int32>& del_set) { 110 DCHECK(vec); 111 112 // Scan through the items read, dropping the items in |del_set|. 113 typename std::vector<T>::iterator add_iter = vec->begin(); 114 for (typename std::vector<T>::iterator iter = add_iter; 115 iter != vec->end(); ++iter) { 116 if (del_set.count(iter->chunk_id) == 0) { 117 *add_iter = *iter; 118 ++add_iter; 119 } 120 } 121 vec->erase(add_iter, vec->end()); 122 } 123 124 enum MissTypes { 125 MISS_TYPE_ALL, 126 MISS_TYPE_FALSE, 127 128 // Always at the end. 129 MISS_TYPE_MAX 130 }; 131 132 } // namespace 133 134 void SBCheckPrefixMisses(const std::vector<SBAddPrefix>& add_prefixes, 135 const std::set<SBPrefix>& prefix_misses) { 136 if (prefix_misses.empty()) 137 return; 138 139 // Record a hit for all prefixes which missed when sent to the 140 // server. 141 for (size_t i = 0; i < prefix_misses.size(); ++i) { 142 UMA_HISTOGRAM_ENUMERATION("SB2.BloomFilterFalsePositives", 143 MISS_TYPE_ALL, MISS_TYPE_MAX); 144 } 145 146 // Collect the misses which are not present in |add_prefixes|. 147 // Since |add_prefixes| can contain multiple copies of the same 148 // prefix, it is not sufficient to count the number of elements 149 // present in both collections. 150 std::set<SBPrefix> false_misses(prefix_misses.begin(), prefix_misses.end()); 151 for (size_t i = 0; i < add_prefixes.size(); ++i) { 152 // |erase()| on an absent element should cost like |find()|. 153 false_misses.erase(add_prefixes[i].prefix); 154 } 155 156 // Record a hit for prefixes which we shouldn't have sent in the 157 // first place. 158 for (size_t i = 0; i < false_misses.size(); ++i) { 159 UMA_HISTOGRAM_ENUMERATION("SB2.BloomFilterFalsePositives", 160 MISS_TYPE_FALSE, MISS_TYPE_MAX); 161 } 162 163 // Divide |MISS_TYPE_FALSE| by |MISS_TYPE_ALL| to get the 164 // bloom-filter false-positive rate. 165 } 166 167 void SBProcessSubs(std::vector<SBAddPrefix>* add_prefixes, 168 std::vector<SBSubPrefix>* sub_prefixes, 169 std::vector<SBAddFullHash>* add_full_hashes, 170 std::vector<SBSubFullHash>* sub_full_hashes, 171 const base::hash_set<int32>& add_chunks_deleted, 172 const base::hash_set<int32>& sub_chunks_deleted) { 173 // It is possible to structure templates and template 174 // specializations such that the following calls work without having 175 // to qualify things. It becomes very arbitrary, though, and less 176 // clear how things are working. 177 178 // Sort the inputs by the SBAddPrefix bits. 179 std::sort(add_prefixes->begin(), add_prefixes->end(), 180 SBAddPrefixLess<SBAddPrefix,SBAddPrefix>); 181 std::sort(sub_prefixes->begin(), sub_prefixes->end(), 182 SBAddPrefixLess<SBSubPrefix,SBSubPrefix>); 183 std::sort(add_full_hashes->begin(), add_full_hashes->end(), 184 SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>); 185 std::sort(sub_full_hashes->begin(), sub_full_hashes->end(), 186 SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>); 187 188 // Factor out the prefix subs. 189 std::vector<SBAddPrefix> removed_adds; 190 KnockoutSubs(sub_prefixes, add_prefixes, 191 SBAddPrefixLess<SBAddPrefix,SBSubPrefix>, 192 SBAddPrefixLess<SBSubPrefix,SBAddPrefix>, 193 &removed_adds); 194 195 // Remove the full-hashes corrosponding to the adds which 196 // KnockoutSubs() removed. Processing these w/in KnockoutSubs() 197 // would make the code more complicated, and they are very small 198 // relative to the prefix lists so the gain would be modest. 199 RemoveMatchingPrefixes(removed_adds, add_full_hashes); 200 RemoveMatchingPrefixes(removed_adds, sub_full_hashes); 201 202 // http://crbug.com/52385 203 // TODO(shess): AFAICT this pass is not done on the trunk. I 204 // believe that's a bug, but it may not matter because full-hash 205 // subs almost never happen (I think you'd need multiple collisions 206 // where one of the sites stopped being flagged?). Enable this once 207 // everything is in. [if(0) instead of #ifdef 0 to make sure it 208 // compiles.] 209 if (0) { 210 // Factor out the full-hash subs. 211 std::vector<SBAddFullHash> removed_full_adds; 212 KnockoutSubs(sub_full_hashes, add_full_hashes, 213 SBAddPrefixHashLess<SBAddFullHash,SBSubFullHash>, 214 SBAddPrefixHashLess<SBSubFullHash,SBAddFullHash>, 215 &removed_full_adds); 216 } 217 218 // Remove items from the deleted chunks. This is done after other 219 // processing to allow subs to knock out adds (and be removed) even 220 // if the add's chunk is deleted. 221 RemoveDeleted(add_prefixes, add_chunks_deleted); 222 RemoveDeleted(sub_prefixes, sub_chunks_deleted); 223 RemoveDeleted(add_full_hashes, add_chunks_deleted); 224 RemoveDeleted(sub_full_hashes, sub_chunks_deleted); 225 } 226