Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // This test performs a series false positive checks using a list of URLs
      6 // against a known set of SafeBrowsing data.
      7 //
      8 // It uses a normal SafeBrowsing database to create a bloom filter where it
      9 // looks up all the URLs in the url file. A URL that has a prefix found in the
     10 // bloom filter and found in the database is considered a hit: a valid lookup
     11 // that will result in a gethash request. A URL that has a prefix found in the
     12 // bloom filter but not in the database is a miss: a false positive lookup that
     13 // will result in an unnecessary gethash request.
     14 //
     15 // By varying the size of the bloom filter and using a constant set of
     16 // SafeBrowsing data, we can check a known set of URLs against the filter and
     17 // determine the false positive rate.
     18 //
     19 // False positive calculation usage:
     20 //   $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.FalsePositives
     21 //                      --filter-start=<integer>
     22 //                      --filter-steps=<integer>
     23 //                      --filter-verbose
     24 //
     25 //  --filter-start:   The filter multiplier to begin with. This represents the
     26 //                    number of bits per prefix of memory to use in the filter.
     27 //                    The default value is identical to the current SafeBrowsing
     28 //                    database value.
     29 //  --filter-steps:   The number of iterations to run, with each iteration
     30 //                    increasing the filter multiplier by 1. The default value
     31 //                    is 1.
     32 //  --filter-verbose: Used to print out the hit / miss results per URL.
     33 //  --filter-csv:     The URL file contains information about the number of
     34 //                    unique views (the popularity) of each URL. See the format
     35 //                    description below.
     36 //
     37 //
     38 // Hash compute time usage:
     39 //   $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.HashTime
     40 //                      --filter-num-checks=<integer>
     41 //
     42 //  --filter-num-checks: The number of hash look ups to perform on the bloom
     43 //                       filter. The default is 10 million.
     44 //
     45 // Data files:
     46 //    chrome/test/data/safe_browsing/filter/database
     47 //    chrome/test/data/safe_browsing/filter/urls
     48 //
     49 // database: A normal SafeBrowsing database.
     50 // urls:     A text file containing a list of URLs, one per line. If the option
     51 //           --filter-csv is specified, the format of each line in the file is
     52 //           <url>,<weight> where weight is an integer indicating the number of
     53 //           unique views for the URL.
     54 
     55 #include <algorithm>
     56 #include <fstream>
     57 #include <vector>
     58 
     59 #include "app/sql/statement.h"
     60 #include "base/command_line.h"
     61 #include "base/file_path.h"
     62 #include "base/file_util.h"
     63 #include "base/logging.h"
     64 #include "base/memory/scoped_ptr.h"
     65 #include "base/path_service.h"
     66 #include "base/rand_util.h"
     67 #include "base/string_number_conversions.h"
     68 #include "base/string_util.h"
     69 #include "base/time.h"
     70 #include "crypto/sha2.h"
     71 #include "chrome/browser/safe_browsing/bloom_filter.h"
     72 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
     73 #include "chrome/common/chrome_paths.h"
     74 #include "googleurl/src/gurl.h"
     75 #include "testing/gtest/include/gtest/gtest.h"
     76 
     77 using base::Time;
     78 using base::TimeDelta;
     79 
     80 namespace {
     81 
     82 // Command line flags.
     83 const char kFilterVerbose[] = "filter-verbose";
     84 const char kFilterStart[] = "filter-start";
     85 const char kFilterSteps[] = "filter-steps";
     86 const char kFilterCsv[] = "filter-csv";
     87 const char kFilterNumChecks[] = "filter-num-checks";
     88 
     89 // Number of hash checks to make during performance testing.
     90 const int kNumHashChecks = 10000000;
     91 
     92 // Returns the path to the data used in this test, relative to the top of the
     93 // source directory.
     94 FilePath GetFullDataPath() {
     95   FilePath full_path;
     96   CHECK(PathService::Get(chrome::DIR_TEST_DATA, &full_path));
     97   full_path = full_path.Append(FILE_PATH_LITERAL("safe_browsing"));
     98   full_path = full_path.Append(FILE_PATH_LITERAL("filter"));
     99   CHECK(file_util::PathExists(full_path));
    100   return full_path;
    101 }
    102 
    103 // Constructs a bloom filter of the appropriate size from the provided prefixes.
    104 void BuildBloomFilter(int size_multiplier,
    105                       const std::vector<SBPrefix>& prefixes,
    106                       BloomFilter** bloom_filter) {
    107   // Create a BloomFilter with the specified size.
    108   const int key_count = std::max(static_cast<int>(prefixes.size()),
    109                                  BloomFilter::kBloomFilterMinSize);
    110   const int filter_size = key_count * size_multiplier;
    111   *bloom_filter = new BloomFilter(filter_size);
    112 
    113   // Add the prefixes to it.
    114   for (size_t i = 0; i < prefixes.size(); ++i)
    115     (*bloom_filter)->Insert(prefixes[i]);
    116 
    117   std::cout << "Bloom filter with prefixes: " << prefixes.size()
    118             << ", multiplier: " << size_multiplier
    119             << ", size (bytes): " << (*bloom_filter)->size()
    120             << std::endl;
    121 }
    122 
    123 // Reads the set of add prefixes contained in a SafeBrowsing database into a
    124 // sorted array suitable for fast searching. This takes significantly less time
    125 // to look up a given prefix than performing SQL queries.
    126 bool ReadDatabase(const FilePath& path, std::vector<SBPrefix>* prefixes) {
    127   FilePath database_file = path.Append(FILE_PATH_LITERAL("database"));
    128   sql::Connection db;
    129   if (!db.Open(database_file))
    130     return false;
    131 
    132   // Get the number of items in the add_prefix table.
    133   const char* query = "SELECT COUNT(*) FROM add_prefix";
    134   sql::Statement count_statement(db.GetCachedStatement(SQL_FROM_HERE, query));
    135   if (!count_statement)
    136     return false;
    137 
    138   if (!count_statement.Step())
    139     return false;
    140 
    141   const int count = count_statement.ColumnInt(0);
    142 
    143   // Load them into a prefix vector and sort.
    144   prefixes->reserve(count);
    145   query = "SELECT prefix FROM add_prefix";
    146   sql::Statement prefix_statement(db.GetCachedStatement(SQL_FROM_HERE, query));
    147   if (!prefix_statement)
    148     return false;
    149 
    150   while (prefix_statement.Step())
    151     prefixes->push_back(prefix_statement.ColumnInt(0));
    152 
    153   DCHECK(static_cast<int>(prefixes->size()) == count);
    154   sort(prefixes->begin(), prefixes->end());
    155 
    156   return true;
    157 }
    158 
    159 // Generates all legal SafeBrowsing prefixes for the specified URL, and returns
    160 // the set of Prefixes that exist in the bloom filter. The function returns the
    161 // number of host + path combinations checked.
    162 int GeneratePrefixHits(const std::string url,
    163                        BloomFilter* bloom_filter,
    164                        std::vector<SBPrefix>* prefixes) {
    165   GURL url_check(url);
    166   std::vector<std::string> hosts;
    167   if (url_check.HostIsIPAddress()) {
    168     hosts.push_back(url_check.host());
    169   } else {
    170     safe_browsing_util::GenerateHostsToCheck(url_check, &hosts);
    171   }
    172 
    173   std::vector<std::string> paths;
    174   safe_browsing_util::GeneratePathsToCheck(url_check, &paths);
    175 
    176   for (size_t i = 0; i < hosts.size(); ++i) {
    177     for (size_t j = 0; j < paths.size(); ++j) {
    178       SBPrefix prefix;
    179       crypto::SHA256HashString(hosts[i] + paths[j], &prefix, sizeof(prefix));
    180       if (bloom_filter->Exists(prefix))
    181         prefixes->push_back(prefix);
    182     }
    183   }
    184 
    185   return hosts.size() * paths.size();
    186 }
    187 
    188 // Binary search of sorted prefixes.
    189 bool IsPrefixInDatabase(SBPrefix prefix,
    190                         const std::vector<SBPrefix>& prefixes) {
    191   if (prefixes.empty())
    192     return false;
    193 
    194   int low = 0;
    195   int high = prefixes.size() - 1;
    196   while (low <= high) {
    197     int mid = ((unsigned int)low + (unsigned int)high) >> 1;
    198     SBPrefix prefix_mid = prefixes[mid];
    199     if (prefix_mid == prefix)
    200       return true;
    201     if (prefix_mid < prefix)
    202       low = mid + 1;
    203     else
    204       high = mid - 1;
    205   }
    206 
    207   return false;
    208 }
    209 
    210 // Construct a bloom filter with the given prefixes and multiplier, and test the
    211 // false positive rate (misses) against a URL list.
    212 void CalculateBloomFilterFalsePositives(
    213     int size_multiplier,
    214     const FilePath& data_dir,
    215     const std::vector<SBPrefix>& prefix_list) {
    216   BloomFilter* bloom_filter = NULL;
    217   BuildBloomFilter(size_multiplier, prefix_list, &bloom_filter);
    218   scoped_refptr<BloomFilter> scoped_filter(bloom_filter);
    219 
    220   // Read in data file line at a time.
    221   FilePath url_file = data_dir.Append(FILE_PATH_LITERAL("urls"));
    222   std::ifstream url_stream(url_file.value().c_str());
    223 
    224   // Keep track of stats
    225   int hits = 0;
    226   int misses = 0;
    227   int weighted_hits = 0;
    228   int weighted_misses = 0;
    229   int url_count = 0;
    230   int prefix_count = 0;
    231 
    232   // Print out volumes of data (per URL hit and miss information).
    233   bool verbose = CommandLine::ForCurrentProcess()->HasSwitch(kFilterVerbose);
    234   bool use_weights = CommandLine::ForCurrentProcess()->HasSwitch(kFilterCsv);
    235 
    236   std::string url;
    237   while (std::getline(url_stream, url)) {
    238     ++url_count;
    239 
    240     // Handle a format that contains URLs weighted by unique views.
    241     int weight = 1;
    242     if (use_weights) {
    243       std::string::size_type pos = url.find_last_of(",");
    244       if (pos != std::string::npos) {
    245         base::StringToInt(url.begin() + pos + 1, url.end(), &weight);
    246         url = url.substr(0, pos);
    247       }
    248     }
    249 
    250     // See if the URL is in the bloom filter.
    251     std::vector<SBPrefix> prefixes;
    252     prefix_count += GeneratePrefixHits(url, bloom_filter, &prefixes);
    253 
    254     // See if the prefix is actually in the database (in-memory prefix list).
    255     for (size_t i = 0; i < prefixes.size(); ++i) {
    256       if (IsPrefixInDatabase(prefixes[i], prefix_list)) {
    257         ++hits;
    258         weighted_hits += weight;
    259         if (verbose) {
    260           std::cout << "Hit for URL: " << url
    261                     << " (prefix = "   << prefixes[i] << ")"
    262                     << std::endl;
    263         }
    264       } else {
    265         ++misses;
    266         weighted_misses += weight;
    267         if (verbose) {
    268           std::cout << "Miss for URL: " << url
    269                     << " (prefix = "    << prefixes[i] << ")"
    270                     << std::endl;
    271         }
    272       }
    273     }
    274   }
    275 
    276   // Print out the results for this test.
    277   std::cout << "URLs checked: "      << url_count
    278             << ", prefix compares: " << prefix_count
    279             << ", hits: "            << hits
    280             << ", misses: "          << misses;
    281   if (use_weights) {
    282     std::cout << ", weighted hits: "   << weighted_hits
    283               << ", weighted misses: " << weighted_misses;
    284   }
    285   std::cout << std::endl;
    286 }
    287 
    288 }  // namespace
    289 
    290 // This test can take several minutes to perform its calculations, so it should
    291 // be disabled until you need to run it.
    292 TEST(SafeBrowsingBloomFilter, FalsePositives) {
    293   std::vector<SBPrefix> prefix_list;
    294   FilePath data_dir = GetFullDataPath();
    295   ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list));
    296 
    297   const CommandLine& cmd_line = *CommandLine::ForCurrentProcess();
    298 
    299   int start = BloomFilter::kBloomFilterSizeRatio;
    300   if (cmd_line.HasSwitch(kFilterStart)) {
    301     ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterStart),
    302                                   &start));
    303   }
    304 
    305   int steps = 1;
    306   if (cmd_line.HasSwitch(kFilterSteps)) {
    307     ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterSteps),
    308                                   &steps));
    309   }
    310 
    311   int stop = start + steps;
    312 
    313   for (int multiplier = start; multiplier < stop; ++multiplier)
    314     CalculateBloomFilterFalsePositives(multiplier, data_dir, prefix_list);
    315 }
    316 
    317 // Computes the time required for performing a number of look ups in a bloom
    318 // filter. This is useful for measuring the performance of new hash functions.
    319 TEST(SafeBrowsingBloomFilter, HashTime) {
    320   // Read the data from the database.
    321   std::vector<SBPrefix> prefix_list;
    322   FilePath data_dir = GetFullDataPath();
    323   ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list));
    324 
    325   const CommandLine& cmd_line = *CommandLine::ForCurrentProcess();
    326 
    327   int num_checks = kNumHashChecks;
    328   if (cmd_line.HasSwitch(kFilterNumChecks)) {
    329     ASSERT_TRUE(
    330         base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterNumChecks),
    331                           &num_checks));
    332   }
    333 
    334   // Populate the bloom filter and measure the time.
    335   BloomFilter* bloom_filter = NULL;
    336   Time populate_before = Time::Now();
    337   BuildBloomFilter(BloomFilter::kBloomFilterSizeRatio,
    338                    prefix_list, &bloom_filter);
    339   TimeDelta populate = Time::Now() - populate_before;
    340 
    341   // Check a large number of random prefixes against the filter.
    342   int hits = 0;
    343   Time check_before = Time::Now();
    344   for (int i = 0; i < num_checks; ++i) {
    345     uint32 prefix = static_cast<uint32>(base::RandUint64());
    346     if (bloom_filter->Exists(prefix))
    347       ++hits;
    348   }
    349   TimeDelta check = Time::Now() - check_before;
    350 
    351   int64 time_per_insert = populate.InMicroseconds() /
    352                           static_cast<int>(prefix_list.size());
    353   int64 time_per_check = check.InMicroseconds() / num_checks;
    354 
    355   std::cout << "Time results for checks: " << num_checks
    356             << ", prefixes: "              << prefix_list.size()
    357             << ", populate time (ms): "    << populate.InMilliseconds()
    358             << ", check time (ms): "       << check.InMilliseconds()
    359             << ", hits: "                  << hits
    360             << ", per-populate (us): "     << time_per_insert
    361             << ", per-check (us): "        << time_per_check
    362             << std::endl;
    363 }
    364