1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 // 5 // This test performs a series false positive checks using a list of URLs 6 // against a known set of SafeBrowsing data. 7 // 8 // It uses a normal SafeBrowsing database to create a bloom filter where it 9 // looks up all the URLs in the url file. A URL that has a prefix found in the 10 // bloom filter and found in the database is considered a hit: a valid lookup 11 // that will result in a gethash request. A URL that has a prefix found in the 12 // bloom filter but not in the database is a miss: a false positive lookup that 13 // will result in an unnecessary gethash request. 14 // 15 // By varying the size of the bloom filter and using a constant set of 16 // SafeBrowsing data, we can check a known set of URLs against the filter and 17 // determine the false positive rate. 18 // 19 // False positive calculation usage: 20 // $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.FalsePositives 21 // --filter-start=<integer> 22 // --filter-steps=<integer> 23 // --filter-verbose 24 // 25 // --filter-start: The filter multiplier to begin with. This represents the 26 // number of bits per prefix of memory to use in the filter. 27 // The default value is identical to the current SafeBrowsing 28 // database value. 29 // --filter-steps: The number of iterations to run, with each iteration 30 // increasing the filter multiplier by 1. The default value 31 // is 1. 32 // --filter-verbose: Used to print out the hit / miss results per URL. 33 // --filter-csv: The URL file contains information about the number of 34 // unique views (the popularity) of each URL. See the format 35 // description below. 36 // 37 // 38 // Hash compute time usage: 39 // $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.HashTime 40 // --filter-num-checks=<integer> 41 // 42 // --filter-num-checks: The number of hash look ups to perform on the bloom 43 // filter. The default is 10 million. 44 // 45 // Data files: 46 // chrome/test/data/safe_browsing/filter/database 47 // chrome/test/data/safe_browsing/filter/urls 48 // 49 // database: A normal SafeBrowsing database. 50 // urls: A text file containing a list of URLs, one per line. If the option 51 // --filter-csv is specified, the format of each line in the file is 52 // <url>,<weight> where weight is an integer indicating the number of 53 // unique views for the URL. 54 55 #include <algorithm> 56 #include <fstream> 57 #include <vector> 58 59 #include "app/sql/statement.h" 60 #include "base/command_line.h" 61 #include "base/file_path.h" 62 #include "base/file_util.h" 63 #include "base/logging.h" 64 #include "base/memory/scoped_ptr.h" 65 #include "base/path_service.h" 66 #include "base/rand_util.h" 67 #include "base/string_number_conversions.h" 68 #include "base/string_util.h" 69 #include "base/time.h" 70 #include "crypto/sha2.h" 71 #include "chrome/browser/safe_browsing/bloom_filter.h" 72 #include "chrome/browser/safe_browsing/safe_browsing_util.h" 73 #include "chrome/common/chrome_paths.h" 74 #include "googleurl/src/gurl.h" 75 #include "testing/gtest/include/gtest/gtest.h" 76 77 using base::Time; 78 using base::TimeDelta; 79 80 namespace { 81 82 // Command line flags. 83 const char kFilterVerbose[] = "filter-verbose"; 84 const char kFilterStart[] = "filter-start"; 85 const char kFilterSteps[] = "filter-steps"; 86 const char kFilterCsv[] = "filter-csv"; 87 const char kFilterNumChecks[] = "filter-num-checks"; 88 89 // Number of hash checks to make during performance testing. 90 const int kNumHashChecks = 10000000; 91 92 // Returns the path to the data used in this test, relative to the top of the 93 // source directory. 94 FilePath GetFullDataPath() { 95 FilePath full_path; 96 CHECK(PathService::Get(chrome::DIR_TEST_DATA, &full_path)); 97 full_path = full_path.Append(FILE_PATH_LITERAL("safe_browsing")); 98 full_path = full_path.Append(FILE_PATH_LITERAL("filter")); 99 CHECK(file_util::PathExists(full_path)); 100 return full_path; 101 } 102 103 // Constructs a bloom filter of the appropriate size from the provided prefixes. 104 void BuildBloomFilter(int size_multiplier, 105 const std::vector<SBPrefix>& prefixes, 106 BloomFilter** bloom_filter) { 107 // Create a BloomFilter with the specified size. 108 const int key_count = std::max(static_cast<int>(prefixes.size()), 109 BloomFilter::kBloomFilterMinSize); 110 const int filter_size = key_count * size_multiplier; 111 *bloom_filter = new BloomFilter(filter_size); 112 113 // Add the prefixes to it. 114 for (size_t i = 0; i < prefixes.size(); ++i) 115 (*bloom_filter)->Insert(prefixes[i]); 116 117 std::cout << "Bloom filter with prefixes: " << prefixes.size() 118 << ", multiplier: " << size_multiplier 119 << ", size (bytes): " << (*bloom_filter)->size() 120 << std::endl; 121 } 122 123 // Reads the set of add prefixes contained in a SafeBrowsing database into a 124 // sorted array suitable for fast searching. This takes significantly less time 125 // to look up a given prefix than performing SQL queries. 126 bool ReadDatabase(const FilePath& path, std::vector<SBPrefix>* prefixes) { 127 FilePath database_file = path.Append(FILE_PATH_LITERAL("database")); 128 sql::Connection db; 129 if (!db.Open(database_file)) 130 return false; 131 132 // Get the number of items in the add_prefix table. 133 const char* query = "SELECT COUNT(*) FROM add_prefix"; 134 sql::Statement count_statement(db.GetCachedStatement(SQL_FROM_HERE, query)); 135 if (!count_statement) 136 return false; 137 138 if (!count_statement.Step()) 139 return false; 140 141 const int count = count_statement.ColumnInt(0); 142 143 // Load them into a prefix vector and sort. 144 prefixes->reserve(count); 145 query = "SELECT prefix FROM add_prefix"; 146 sql::Statement prefix_statement(db.GetCachedStatement(SQL_FROM_HERE, query)); 147 if (!prefix_statement) 148 return false; 149 150 while (prefix_statement.Step()) 151 prefixes->push_back(prefix_statement.ColumnInt(0)); 152 153 DCHECK(static_cast<int>(prefixes->size()) == count); 154 sort(prefixes->begin(), prefixes->end()); 155 156 return true; 157 } 158 159 // Generates all legal SafeBrowsing prefixes for the specified URL, and returns 160 // the set of Prefixes that exist in the bloom filter. The function returns the 161 // number of host + path combinations checked. 162 int GeneratePrefixHits(const std::string url, 163 BloomFilter* bloom_filter, 164 std::vector<SBPrefix>* prefixes) { 165 GURL url_check(url); 166 std::vector<std::string> hosts; 167 if (url_check.HostIsIPAddress()) { 168 hosts.push_back(url_check.host()); 169 } else { 170 safe_browsing_util::GenerateHostsToCheck(url_check, &hosts); 171 } 172 173 std::vector<std::string> paths; 174 safe_browsing_util::GeneratePathsToCheck(url_check, &paths); 175 176 for (size_t i = 0; i < hosts.size(); ++i) { 177 for (size_t j = 0; j < paths.size(); ++j) { 178 SBPrefix prefix; 179 crypto::SHA256HashString(hosts[i] + paths[j], &prefix, sizeof(prefix)); 180 if (bloom_filter->Exists(prefix)) 181 prefixes->push_back(prefix); 182 } 183 } 184 185 return hosts.size() * paths.size(); 186 } 187 188 // Binary search of sorted prefixes. 189 bool IsPrefixInDatabase(SBPrefix prefix, 190 const std::vector<SBPrefix>& prefixes) { 191 if (prefixes.empty()) 192 return false; 193 194 int low = 0; 195 int high = prefixes.size() - 1; 196 while (low <= high) { 197 int mid = ((unsigned int)low + (unsigned int)high) >> 1; 198 SBPrefix prefix_mid = prefixes[mid]; 199 if (prefix_mid == prefix) 200 return true; 201 if (prefix_mid < prefix) 202 low = mid + 1; 203 else 204 high = mid - 1; 205 } 206 207 return false; 208 } 209 210 // Construct a bloom filter with the given prefixes and multiplier, and test the 211 // false positive rate (misses) against a URL list. 212 void CalculateBloomFilterFalsePositives( 213 int size_multiplier, 214 const FilePath& data_dir, 215 const std::vector<SBPrefix>& prefix_list) { 216 BloomFilter* bloom_filter = NULL; 217 BuildBloomFilter(size_multiplier, prefix_list, &bloom_filter); 218 scoped_refptr<BloomFilter> scoped_filter(bloom_filter); 219 220 // Read in data file line at a time. 221 FilePath url_file = data_dir.Append(FILE_PATH_LITERAL("urls")); 222 std::ifstream url_stream(url_file.value().c_str()); 223 224 // Keep track of stats 225 int hits = 0; 226 int misses = 0; 227 int weighted_hits = 0; 228 int weighted_misses = 0; 229 int url_count = 0; 230 int prefix_count = 0; 231 232 // Print out volumes of data (per URL hit and miss information). 233 bool verbose = CommandLine::ForCurrentProcess()->HasSwitch(kFilterVerbose); 234 bool use_weights = CommandLine::ForCurrentProcess()->HasSwitch(kFilterCsv); 235 236 std::string url; 237 while (std::getline(url_stream, url)) { 238 ++url_count; 239 240 // Handle a format that contains URLs weighted by unique views. 241 int weight = 1; 242 if (use_weights) { 243 std::string::size_type pos = url.find_last_of(","); 244 if (pos != std::string::npos) { 245 base::StringToInt(url.begin() + pos + 1, url.end(), &weight); 246 url = url.substr(0, pos); 247 } 248 } 249 250 // See if the URL is in the bloom filter. 251 std::vector<SBPrefix> prefixes; 252 prefix_count += GeneratePrefixHits(url, bloom_filter, &prefixes); 253 254 // See if the prefix is actually in the database (in-memory prefix list). 255 for (size_t i = 0; i < prefixes.size(); ++i) { 256 if (IsPrefixInDatabase(prefixes[i], prefix_list)) { 257 ++hits; 258 weighted_hits += weight; 259 if (verbose) { 260 std::cout << "Hit for URL: " << url 261 << " (prefix = " << prefixes[i] << ")" 262 << std::endl; 263 } 264 } else { 265 ++misses; 266 weighted_misses += weight; 267 if (verbose) { 268 std::cout << "Miss for URL: " << url 269 << " (prefix = " << prefixes[i] << ")" 270 << std::endl; 271 } 272 } 273 } 274 } 275 276 // Print out the results for this test. 277 std::cout << "URLs checked: " << url_count 278 << ", prefix compares: " << prefix_count 279 << ", hits: " << hits 280 << ", misses: " << misses; 281 if (use_weights) { 282 std::cout << ", weighted hits: " << weighted_hits 283 << ", weighted misses: " << weighted_misses; 284 } 285 std::cout << std::endl; 286 } 287 288 } // namespace 289 290 // This test can take several minutes to perform its calculations, so it should 291 // be disabled until you need to run it. 292 TEST(SafeBrowsingBloomFilter, FalsePositives) { 293 std::vector<SBPrefix> prefix_list; 294 FilePath data_dir = GetFullDataPath(); 295 ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list)); 296 297 const CommandLine& cmd_line = *CommandLine::ForCurrentProcess(); 298 299 int start = BloomFilter::kBloomFilterSizeRatio; 300 if (cmd_line.HasSwitch(kFilterStart)) { 301 ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterStart), 302 &start)); 303 } 304 305 int steps = 1; 306 if (cmd_line.HasSwitch(kFilterSteps)) { 307 ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterSteps), 308 &steps)); 309 } 310 311 int stop = start + steps; 312 313 for (int multiplier = start; multiplier < stop; ++multiplier) 314 CalculateBloomFilterFalsePositives(multiplier, data_dir, prefix_list); 315 } 316 317 // Computes the time required for performing a number of look ups in a bloom 318 // filter. This is useful for measuring the performance of new hash functions. 319 TEST(SafeBrowsingBloomFilter, HashTime) { 320 // Read the data from the database. 321 std::vector<SBPrefix> prefix_list; 322 FilePath data_dir = GetFullDataPath(); 323 ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list)); 324 325 const CommandLine& cmd_line = *CommandLine::ForCurrentProcess(); 326 327 int num_checks = kNumHashChecks; 328 if (cmd_line.HasSwitch(kFilterNumChecks)) { 329 ASSERT_TRUE( 330 base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterNumChecks), 331 &num_checks)); 332 } 333 334 // Populate the bloom filter and measure the time. 335 BloomFilter* bloom_filter = NULL; 336 Time populate_before = Time::Now(); 337 BuildBloomFilter(BloomFilter::kBloomFilterSizeRatio, 338 prefix_list, &bloom_filter); 339 TimeDelta populate = Time::Now() - populate_before; 340 341 // Check a large number of random prefixes against the filter. 342 int hits = 0; 343 Time check_before = Time::Now(); 344 for (int i = 0; i < num_checks; ++i) { 345 uint32 prefix = static_cast<uint32>(base::RandUint64()); 346 if (bloom_filter->Exists(prefix)) 347 ++hits; 348 } 349 TimeDelta check = Time::Now() - check_before; 350 351 int64 time_per_insert = populate.InMicroseconds() / 352 static_cast<int>(prefix_list.size()); 353 int64 time_per_check = check.InMicroseconds() / num_checks; 354 355 std::cout << "Time results for checks: " << num_checks 356 << ", prefixes: " << prefix_list.size() 357 << ", populate time (ms): " << populate.InMilliseconds() 358 << ", check time (ms): " << check.InMilliseconds() 359 << ", hits: " << hits 360 << ", per-populate (us): " << time_per_insert 361 << ", per-check (us): " << time_per_check 362 << std::endl; 363 } 364