1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/safe_browsing/prefix_set.h" 6 7 #include <algorithm> 8 #include <math.h> 9 10 #include "base/file_util.h" 11 #include "base/logging.h" 12 #include "base/md5.h" 13 #include "base/metrics/histogram.h" 14 15 namespace { 16 17 // |kMagic| should be reasonably unique, and not match itself across 18 // endianness changes. I generated this value with: 19 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9 20 static uint32 kMagic = 0x864088dd; 21 22 // Current version the code writes out. 23 static uint32 kVersion = 0x1; 24 25 typedef struct { 26 uint32 magic; 27 uint32 version; 28 uint32 index_size; 29 uint32 deltas_size; 30 } FileHeader; 31 32 // For |std::upper_bound()| to find a prefix w/in a vector of pairs. 33 bool PrefixLess(const std::pair<SBPrefix,size_t>& a, 34 const std::pair<SBPrefix,size_t>& b) { 35 return a.first < b.first; 36 } 37 38 } // namespace 39 40 namespace safe_browsing { 41 42 PrefixSet::PrefixSet(const std::vector<SBPrefix>& sorted_prefixes) { 43 if (sorted_prefixes.size()) { 44 // Lead with the first prefix. 45 SBPrefix prev_prefix = sorted_prefixes[0]; 46 size_t run_length = 0; 47 index_.push_back(std::make_pair(prev_prefix, deltas_.size())); 48 49 // Used to build a checksum from the data used to construct the 50 // structures. Since the data is a bunch of uniform hashes, it 51 // seems reasonable to just xor most of it in, rather than trying 52 // to use a more complicated algorithm. 53 uint32 checksum = static_cast<uint32>(sorted_prefixes[0]); 54 checksum ^= static_cast<uint32>(deltas_.size()); 55 56 for (size_t i = 1; i < sorted_prefixes.size(); ++i) { 57 // Skip duplicates. 58 if (sorted_prefixes[i] == prev_prefix) 59 continue; 60 61 // Calculate the delta. |unsigned| is mandatory, because the 62 // sorted_prefixes could be more than INT_MAX apart. 63 DCHECK_GT(sorted_prefixes[i], prev_prefix); 64 const unsigned delta = sorted_prefixes[i] - prev_prefix; 65 const uint16 delta16 = static_cast<uint16>(delta); 66 67 // New index ref if the delta doesn't fit, or if too many 68 // consecutive deltas have been encoded. 69 if (delta != static_cast<unsigned>(delta16) || run_length >= kMaxRun) { 70 checksum ^= static_cast<uint32>(sorted_prefixes[i]); 71 checksum ^= static_cast<uint32>(deltas_.size()); 72 index_.push_back(std::make_pair(sorted_prefixes[i], deltas_.size())); 73 run_length = 0; 74 } else { 75 checksum ^= static_cast<uint32>(delta16); 76 // Continue the run of deltas. 77 deltas_.push_back(delta16); 78 DCHECK_EQ(static_cast<unsigned>(deltas_.back()), delta); 79 ++run_length; 80 } 81 82 prev_prefix = sorted_prefixes[i]; 83 } 84 checksum_ = checksum; 85 DCHECK(CheckChecksum()); 86 87 // Send up some memory-usage stats. Bits because fractional bytes 88 // are weird. 89 const size_t bits_used = index_.size() * sizeof(index_[0]) * CHAR_BIT + 90 deltas_.size() * sizeof(deltas_[0]) * CHAR_BIT; 91 const size_t unique_prefixes = index_.size() + deltas_.size(); 92 static const size_t kMaxBitsPerPrefix = sizeof(SBPrefix) * CHAR_BIT; 93 UMA_HISTOGRAM_ENUMERATION("SB2.PrefixSetBitsPerPrefix", 94 bits_used / unique_prefixes, 95 kMaxBitsPerPrefix); 96 } 97 } 98 99 PrefixSet::PrefixSet(std::vector<std::pair<SBPrefix,size_t> > *index, 100 std::vector<uint16> *deltas) { 101 DCHECK(index && deltas); 102 index_.swap(*index); 103 deltas_.swap(*deltas); 104 } 105 106 PrefixSet::~PrefixSet() {} 107 108 bool PrefixSet::Exists(SBPrefix prefix) const { 109 if (index_.empty()) 110 return false; 111 112 // Find the first position after |prefix| in |index_|. 113 std::vector<std::pair<SBPrefix,size_t> >::const_iterator 114 iter = std::upper_bound(index_.begin(), index_.end(), 115 std::pair<SBPrefix,size_t>(prefix, 0), 116 PrefixLess); 117 118 // |prefix| comes before anything that's in the set. 119 if (iter == index_.begin()) 120 return false; 121 122 // Capture the upper bound of our target entry's deltas. 123 const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second); 124 125 // Back up to the entry our target is in. 126 --iter; 127 128 // All prefixes in |index_| are in the set. 129 SBPrefix current = iter->first; 130 if (current == prefix) 131 return true; 132 133 // Scan forward accumulating deltas while a match is possible. 134 for (size_t di = iter->second; di < bound && current < prefix; ++di) { 135 current += deltas_[di]; 136 } 137 138 return current == prefix; 139 } 140 141 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const { 142 for (size_t ii = 0; ii < index_.size(); ++ii) { 143 // The deltas for this |index_| entry run to the next index entry, 144 // or the end of the deltas. 145 const size_t deltas_end = 146 (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size(); 147 148 SBPrefix current = index_[ii].first; 149 prefixes->push_back(current); 150 for (size_t di = index_[ii].second; di < deltas_end; ++di) { 151 current += deltas_[di]; 152 prefixes->push_back(current); 153 } 154 } 155 } 156 157 // static 158 PrefixSet* PrefixSet::LoadFile(const FilePath& filter_name) { 159 int64 size_64; 160 if (!file_util::GetFileSize(filter_name, &size_64)) 161 return NULL; 162 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest))) 163 return NULL; 164 165 file_util::ScopedFILE file(file_util::OpenFile(filter_name, "rb")); 166 if (!file.get()) 167 return NULL; 168 169 FileHeader header; 170 size_t read = fread(&header, sizeof(header), 1, file.get()); 171 if (read != 1) 172 return NULL; 173 174 if (header.magic != kMagic || header.version != kVersion) 175 return NULL; 176 177 std::vector<std::pair<SBPrefix,size_t> > index; 178 const size_t index_bytes = sizeof(index[0]) * header.index_size; 179 180 std::vector<uint16> deltas; 181 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size; 182 183 // Check for bogus sizes before allocating any space. 184 const size_t expected_bytes = 185 sizeof(header) + index_bytes + deltas_bytes + sizeof(MD5Digest); 186 if (static_cast<int64>(expected_bytes) != size_64) 187 return NULL; 188 189 // The file looks valid, start building the digest. 190 MD5Context context; 191 MD5Init(&context); 192 MD5Update(&context, &header, sizeof(header)); 193 194 // Read the index vector. Herb Sutter indicates that vectors are 195 // guaranteed to be contiuguous, so reading to where element 0 lives 196 // is valid. 197 index.resize(header.index_size); 198 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get()); 199 if (read != index.size()) 200 return NULL; 201 MD5Update(&context, &(index[0]), index_bytes); 202 203 // Read vector of deltas. 204 deltas.resize(header.deltas_size); 205 read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get()); 206 if (read != deltas.size()) 207 return NULL; 208 MD5Update(&context, &(deltas[0]), deltas_bytes); 209 210 MD5Digest calculated_digest; 211 MD5Final(&calculated_digest, &context); 212 213 MD5Digest file_digest; 214 read = fread(&file_digest, sizeof(file_digest), 1, file.get()); 215 if (read != 1) 216 return NULL; 217 218 if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest))) 219 return NULL; 220 221 // Steals contents of |index| and |deltas| via swap(). 222 return new PrefixSet(&index, &deltas); 223 } 224 225 bool PrefixSet::WriteFile(const FilePath& filter_name) const { 226 FileHeader header; 227 header.magic = kMagic; 228 header.version = kVersion; 229 header.index_size = static_cast<uint32>(index_.size()); 230 header.deltas_size = static_cast<uint32>(deltas_.size()); 231 232 // Sanity check that the 32-bit values never mess things up. 233 if (static_cast<size_t>(header.index_size) != index_.size() || 234 static_cast<size_t>(header.deltas_size) != deltas_.size()) { 235 NOTREACHED(); 236 return false; 237 } 238 239 file_util::ScopedFILE file(file_util::OpenFile(filter_name, "wb")); 240 if (!file.get()) 241 return false; 242 243 MD5Context context; 244 MD5Init(&context); 245 246 // TODO(shess): The I/O code in safe_browsing_store_file.cc would 247 // sure be useful about now. 248 size_t written = fwrite(&header, sizeof(header), 1, file.get()); 249 if (written != 1) 250 return false; 251 MD5Update(&context, &header, sizeof(header)); 252 253 // As for reads, the standard guarantees the ability to access the 254 // contents of the vector by a pointer to an element. 255 const size_t index_bytes = sizeof(index_[0]) * index_.size(); 256 written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(), file.get()); 257 if (written != index_.size()) 258 return false; 259 MD5Update(&context, &(index_[0]), index_bytes); 260 261 const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size(); 262 written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(), 263 file.get()); 264 if (written != deltas_.size()) 265 return false; 266 MD5Update(&context, &(deltas_[0]), deltas_bytes); 267 268 MD5Digest digest; 269 MD5Final(&digest, &context); 270 written = fwrite(&digest, sizeof(digest), 1, file.get()); 271 if (written != 1) 272 return false; 273 274 // TODO(shess): Can this code check that the close was successful? 275 file.reset(); 276 277 return true; 278 } 279 280 size_t PrefixSet::IndexBinFor(size_t target_index) const { 281 // The items in |index_| have the logical index of each previous 282 // item in |index_| plus the count of deltas between the items. 283 // Since the indices into |deltas_| are absolute, the logical index 284 // is then the sum of the two indices. 285 size_t lo = 0; 286 size_t hi = index_.size(); 287 288 // Binary search because linear search was too slow (really, the 289 // unit test sucked). Inline because the elements can't be compared 290 // in isolation (their position matters). 291 while (hi - lo > 1) { 292 const size_t i = (lo + hi) / 2; 293 294 if (target_index < i + index_[i].second) { 295 DCHECK_LT(i, hi); // Always making progress. 296 hi = i; 297 } else { 298 DCHECK_GT(i, lo); // Always making progress. 299 lo = i; 300 } 301 } 302 return lo; 303 } 304 305 size_t PrefixSet::GetSize() const { 306 return index_.size() + deltas_.size(); 307 } 308 309 bool PrefixSet::IsDeltaAt(size_t target_index) const { 310 CHECK_LT(target_index, GetSize()); 311 312 const size_t i = IndexBinFor(target_index); 313 return target_index > i + index_[i].second; 314 } 315 316 uint16 PrefixSet::DeltaAt(size_t target_index) const { 317 CHECK_LT(target_index, GetSize()); 318 319 // Find the |index_| entry which contains |target_index|. 320 const size_t i = IndexBinFor(target_index); 321 322 // Exactly on the |index_| entry means no delta. 323 CHECK_GT(target_index, i + index_[i].second); 324 325 // -i backs out the |index_| entries, -1 gets the delta that lead to 326 // the value at |target_index|. 327 CHECK_LT(target_index - i - 1, deltas_.size()); 328 return deltas_[target_index - i - 1]; 329 } 330 331 bool PrefixSet::CheckChecksum() const { 332 uint32 checksum = 0; 333 334 for (size_t ii = 0; ii < index_.size(); ++ii) { 335 checksum ^= static_cast<uint32>(index_[ii].first); 336 checksum ^= static_cast<uint32>(index_[ii].second); 337 } 338 339 for (size_t di = 0; di < deltas_.size(); ++di) { 340 checksum ^= static_cast<uint32>(deltas_[di]); 341 } 342 343 return checksum == checksum_; 344 } 345 346 } // namespace safe_browsing 347