1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/safe_browsing/prefix_set.h" 6 7 #include <algorithm> 8 9 #include "base/files/file_util.h" 10 #include "base/files/scoped_file.h" 11 #include "base/logging.h" 12 #include "base/md5.h" 13 #include "base/metrics/histogram.h" 14 #include "base/metrics/sparse_histogram.h" 15 16 namespace { 17 18 // |kMagic| should be reasonably unique, and not match itself across 19 // endianness changes. I generated this value with: 20 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9 21 static uint32 kMagic = 0x864088dd; 22 23 // Version history: 24 // Version 1: b6cb7cfe/r74487 by shess (at) chromium.org on 2011-02-10 25 // Version 2: 2b59b0a6/r253924 by shess (at) chromium.org on 2014-02-27 26 // Version 3: dd07faf5/r268145 by shess (at) chromium.org on 2014-05-05 27 28 // Version 2 layout is identical to version 1. The sort order of |index_| 29 // changed from |int32| to |uint32| to match the change of |SBPrefix|. 30 // Version 3 adds storage for full hashes. 31 static uint32 kVersion = 3; 32 static uint32 kDeprecatedVersion = 2; // And lower. 33 34 typedef struct { 35 uint32 magic; 36 uint32 version; 37 uint32 index_size; 38 uint32 deltas_size; 39 uint32 full_hashes_size; 40 } FileHeader; 41 42 // Common std::vector<> implementations add capacity by multiplying from the 43 // current size (usually either by 2 or 1.5) to satisfy push_back() running in 44 // amortized constant time. This is not necessary for insert() at end(), but 45 // AFAICT it seems true for some implementations. SBPrefix values should 46 // uniformly cover the 32-bit space, so the final size can be estimated given a 47 // subset of the input. 48 // 49 // |kEstimateThreshold| is when estimates start converging. Results are strong 50 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of 51 // resizing capacity from >50% to 100%. 52 // 53 // TODO(shess): I'm sure there is math in the world to describe good settings 54 // for estimating the size of a uniformly-distributed set of integers from a 55 // sorted subset. I do not have such math in me, so I assumed that my current 56 // organic database of prefixes was scale-free, and wrote a script to see how 57 // often given slop values would always suffice for given strides. At 1<<30, 58 // .5% slop was sufficient to cover all cases (though the code below uses 1%). 59 // 60 // TODO(shess): A smaller threshold uses less transient space in reallocation. 61 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost 62 // is that a smaller threshold needs more slop (locked down for the long term). 63 // 1<<29 worked well with 1%, 1<<27 worked well with 2%. 64 const SBPrefix kEstimateThreshold = 1 << 30; 65 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) { 66 // estimated_count / current_count == estimated_max / current_prefix 67 // For large input sets, estimated_max of 2^32 is close enough. 68 const size_t estimated_prefix_count = static_cast<size_t>( 69 (static_cast<uint64>(current_count) << 32) / current_prefix); 70 71 // The estimate has an error bar, if the final total is below the estimate, no 72 // harm, but if it is above a capacity resize will happen at nearly 100%. Add 73 // some slop to make sure all cases are covered. 74 return estimated_prefix_count + estimated_prefix_count / 100; 75 } 76 77 } // namespace 78 79 namespace safe_browsing { 80 81 // For |std::upper_bound()| to find a prefix w/in a vector of pairs. 82 // static 83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) { 84 return a.first < b.first; 85 } 86 87 PrefixSet::PrefixSet() { 88 } 89 90 PrefixSet::PrefixSet(IndexVector* index, 91 std::vector<uint16>* deltas, 92 std::vector<SBFullHash>* full_hashes) { 93 DCHECK(index && deltas && full_hashes); 94 index_.swap(*index); 95 deltas_.swap(*deltas); 96 full_hashes_.swap(*full_hashes); 97 } 98 99 PrefixSet::~PrefixSet() {} 100 101 bool PrefixSet::PrefixExists(SBPrefix prefix) const { 102 if (index_.empty()) 103 return false; 104 105 // Find the first position after |prefix| in |index_|. 106 IndexVector::const_iterator iter = 107 std::upper_bound(index_.begin(), index_.end(), 108 IndexPair(prefix, 0), PrefixLess); 109 110 // |prefix| comes before anything that's in the set. 111 if (iter == index_.begin()) 112 return false; 113 114 // Capture the upper bound of our target entry's deltas. 115 const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second); 116 117 // Back up to the entry our target is in. 118 --iter; 119 120 // All prefixes in |index_| are in the set. 121 SBPrefix current = iter->first; 122 if (current == prefix) 123 return true; 124 125 // Scan forward accumulating deltas while a match is possible. 126 for (size_t di = iter->second; di < bound && current < prefix; ++di) { 127 current += deltas_[di]; 128 } 129 130 return current == prefix; 131 } 132 133 bool PrefixSet::Exists(const SBFullHash& hash) const { 134 if (std::binary_search(full_hashes_.begin(), full_hashes_.end(), 135 hash, SBFullHashLess)) { 136 return true; 137 } 138 return PrefixExists(hash.prefix); 139 } 140 141 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const { 142 prefixes->reserve(index_.size() + deltas_.size()); 143 144 for (size_t ii = 0; ii < index_.size(); ++ii) { 145 // The deltas for this |index_| entry run to the next index entry, 146 // or the end of the deltas. 147 const size_t deltas_end = 148 (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size(); 149 150 SBPrefix current = index_[ii].first; 151 prefixes->push_back(current); 152 for (size_t di = index_[ii].second; di < deltas_end; ++di) { 153 current += deltas_[di]; 154 prefixes->push_back(current); 155 } 156 } 157 } 158 159 // static 160 scoped_ptr<PrefixSet> PrefixSet::LoadFile(const base::FilePath& filter_name) { 161 int64 size_64; 162 if (!base::GetFileSize(filter_name, &size_64)) 163 return scoped_ptr<PrefixSet>(); 164 using base::MD5Digest; 165 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest))) 166 return scoped_ptr<PrefixSet>(); 167 168 base::ScopedFILE file(base::OpenFile(filter_name, "rb")); 169 if (!file.get()) 170 return scoped_ptr<PrefixSet>(); 171 172 FileHeader header; 173 size_t read = fread(&header, sizeof(header), 1, file.get()); 174 if (read != 1) 175 return scoped_ptr<PrefixSet>(); 176 177 // The file looks valid, start building the digest. 178 base::MD5Context context; 179 base::MD5Init(&context); 180 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header), 181 sizeof(header))); 182 183 if (header.magic != kMagic) 184 return scoped_ptr<PrefixSet>(); 185 186 // Track version read to inform removal of support for older versions. 187 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version); 188 189 if (header.version <= kDeprecatedVersion) { 190 return scoped_ptr<PrefixSet>(); 191 } else if (header.version != kVersion) { 192 return scoped_ptr<PrefixSet>(); 193 } 194 195 IndexVector index; 196 const size_t index_bytes = sizeof(index[0]) * header.index_size; 197 198 std::vector<uint16> deltas; 199 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size; 200 201 std::vector<SBFullHash> full_hashes; 202 const size_t full_hashes_bytes = 203 sizeof(full_hashes[0]) * header.full_hashes_size; 204 205 // Check for bogus sizes before allocating any space. 206 const size_t expected_bytes = sizeof(header) + 207 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest); 208 if (static_cast<int64>(expected_bytes) != size_64) 209 return scoped_ptr<PrefixSet>(); 210 211 // Read the index vector. Herb Sutter indicates that vectors are 212 // guaranteed to be contiuguous, so reading to where element 0 lives 213 // is valid. 214 if (header.index_size) { 215 index.resize(header.index_size); 216 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get()); 217 if (read != index.size()) 218 return scoped_ptr<PrefixSet>(); 219 base::MD5Update(&context, 220 base::StringPiece(reinterpret_cast<char*>(&(index[0])), 221 index_bytes)); 222 } 223 224 // Read vector of deltas. 225 if (header.deltas_size) { 226 deltas.resize(header.deltas_size); 227 read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get()); 228 if (read != deltas.size()) 229 return scoped_ptr<PrefixSet>(); 230 base::MD5Update(&context, 231 base::StringPiece(reinterpret_cast<char*>(&(deltas[0])), 232 deltas_bytes)); 233 } 234 235 // Read vector of full hashes. 236 if (header.full_hashes_size) { 237 full_hashes.resize(header.full_hashes_size); 238 read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(), 239 file.get()); 240 if (read != full_hashes.size()) 241 return scoped_ptr<PrefixSet>(); 242 base::MD5Update(&context, 243 base::StringPiece( 244 reinterpret_cast<char*>(&(full_hashes[0])), 245 full_hashes_bytes)); 246 } 247 248 base::MD5Digest calculated_digest; 249 base::MD5Final(&calculated_digest, &context); 250 251 base::MD5Digest file_digest; 252 read = fread(&file_digest, sizeof(file_digest), 1, file.get()); 253 if (read != 1) 254 return scoped_ptr<PrefixSet>(); 255 256 if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest))) 257 return scoped_ptr<PrefixSet>(); 258 259 // Steals vector contents using swap(). 260 return scoped_ptr<PrefixSet>(new PrefixSet(&index, &deltas, &full_hashes)); 261 } 262 263 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const { 264 FileHeader header; 265 header.magic = kMagic; 266 header.version = kVersion; 267 header.index_size = static_cast<uint32>(index_.size()); 268 header.deltas_size = static_cast<uint32>(deltas_.size()); 269 header.full_hashes_size = static_cast<uint32>(full_hashes_.size()); 270 271 // Sanity check that the 32-bit values never mess things up. 272 if (static_cast<size_t>(header.index_size) != index_.size() || 273 static_cast<size_t>(header.deltas_size) != deltas_.size() || 274 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) { 275 NOTREACHED(); 276 return false; 277 } 278 279 base::ScopedFILE file(base::OpenFile(filter_name, "wb")); 280 if (!file.get()) 281 return false; 282 283 base::MD5Context context; 284 base::MD5Init(&context); 285 286 // TODO(shess): The I/O code in safe_browsing_store_file.cc would 287 // sure be useful about now. 288 size_t written = fwrite(&header, sizeof(header), 1, file.get()); 289 if (written != 1) 290 return false; 291 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header), 292 sizeof(header))); 293 294 // As for reads, the standard guarantees the ability to access the 295 // contents of the vector by a pointer to an element. 296 if (index_.size()) { 297 const size_t index_bytes = sizeof(index_[0]) * index_.size(); 298 written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(), 299 file.get()); 300 if (written != index_.size()) 301 return false; 302 base::MD5Update(&context, 303 base::StringPiece( 304 reinterpret_cast<const char*>(&(index_[0])), 305 index_bytes)); 306 } 307 308 if (deltas_.size()) { 309 const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size(); 310 written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(), 311 file.get()); 312 if (written != deltas_.size()) 313 return false; 314 base::MD5Update(&context, 315 base::StringPiece( 316 reinterpret_cast<const char*>(&(deltas_[0])), 317 deltas_bytes)); 318 } 319 320 if (full_hashes_.size()) { 321 const size_t elt_size = sizeof(full_hashes_[0]); 322 const size_t elts = full_hashes_.size(); 323 const size_t full_hashes_bytes = elt_size * elts; 324 written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get()); 325 if (written != elts) 326 return false; 327 base::MD5Update(&context, 328 base::StringPiece( 329 reinterpret_cast<const char*>(&(full_hashes_[0])), 330 full_hashes_bytes)); 331 } 332 333 base::MD5Digest digest; 334 base::MD5Final(&digest, &context); 335 written = fwrite(&digest, sizeof(digest), 1, file.get()); 336 if (written != 1) 337 return false; 338 339 // TODO(shess): Can this code check that the close was successful? 340 file.reset(); 341 342 return true; 343 } 344 345 void PrefixSet::AddRun(SBPrefix index_prefix, 346 const uint16* run_begin, const uint16* run_end) { 347 // Preempt organic capacity decisions for |delta_| once strong estimates can 348 // be made. 349 if (index_prefix > kEstimateThreshold && 350 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) { 351 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size())); 352 } 353 354 index_.push_back(std::make_pair(index_prefix, deltas_.size())); 355 deltas_.insert(deltas_.end(), run_begin, run_end); 356 } 357 358 PrefixSetBuilder::PrefixSetBuilder() 359 : prefix_set_(new PrefixSet()) { 360 } 361 362 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes) 363 : prefix_set_(new PrefixSet()) { 364 for (size_t i = 0; i < prefixes.size(); ++i) { 365 AddPrefix(prefixes[i]); 366 } 367 } 368 369 PrefixSetBuilder::~PrefixSetBuilder() { 370 } 371 372 scoped_ptr<PrefixSet> PrefixSetBuilder::GetPrefixSet( 373 const std::vector<SBFullHash>& hashes) { 374 DCHECK(prefix_set_.get()); 375 376 // Flush runs until buffered data is gone. 377 while (!buffer_.empty()) { 378 EmitRun(); 379 } 380 381 // Precisely size |index_| for read-only. It's 50k-60k, so minor savings, but 382 // they're almost free. 383 PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_); 384 385 prefix_set_->full_hashes_ = hashes; 386 std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(), 387 SBFullHashLess); 388 389 return prefix_set_.Pass(); 390 } 391 392 scoped_ptr<PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() { 393 return GetPrefixSet(std::vector<SBFullHash>()).Pass(); 394 } 395 396 void PrefixSetBuilder::EmitRun() { 397 DCHECK(prefix_set_.get()); 398 399 SBPrefix prev_prefix = buffer_[0]; 400 uint16 run[PrefixSet::kMaxRun]; 401 size_t run_pos = 0; 402 403 size_t i; 404 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) { 405 // Calculate the delta. |unsigned| is mandatory, because the 406 // sorted_prefixes could be more than INT_MAX apart. 407 DCHECK_GT(buffer_[i], prev_prefix); 408 const unsigned delta = buffer_[i] - prev_prefix; 409 const uint16 delta16 = static_cast<uint16>(delta); 410 411 // Break the run if the delta doesn't fit. 412 if (delta != static_cast<unsigned>(delta16)) 413 break; 414 415 // Continue the run of deltas. 416 run[run_pos++] = delta16; 417 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta); 418 419 prev_prefix = buffer_[i]; 420 } 421 prefix_set_->AddRun(buffer_[0], run, run + run_pos); 422 buffer_.erase(buffer_.begin(), buffer_.begin() + i); 423 } 424 425 void PrefixSetBuilder::AddPrefix(SBPrefix prefix) { 426 DCHECK(prefix_set_.get()); 427 428 if (buffer_.empty()) { 429 DCHECK(prefix_set_->index_.empty()); 430 DCHECK(prefix_set_->deltas_.empty()); 431 } else { 432 // Drop duplicates. 433 if (buffer_.back() == prefix) 434 return; 435 436 DCHECK_LT(buffer_.back(), prefix); 437 } 438 buffer_.push_back(prefix); 439 440 // Flush buffer when a run can be constructed. +1 for the index item, and +1 441 // to leave at least one item in the buffer for dropping duplicates. 442 if (buffer_.size() > PrefixSet::kMaxRun + 2) 443 EmitRun(); 444 } 445 446 } // namespace safe_browsing 447