Home | History | Annotate | Download | only in bench
      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2015 Benoit Jacob <benoitjacob (at) google.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <random>
#include <vector>
     17 
     18 bool eigen_use_specific_block_size;
     19 int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
     20 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
     21 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
     22 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
     23 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
     24 #include <Eigen/Core>
     25 
     26 #include <bench/BenchTimer.h>
     27 
     28 using namespace Eigen;
     29 using namespace std;
     30 
     31 static BenchTimer timer;
     32 
     33 // how many times we repeat each measurement.
     34 // measurements are randomly shuffled - we're not doing
     35 // all N identical measurements in a row.
     36 const int measurement_repetitions = 3;
     37 
     38 // Timings below this value are too short to be accurate,
     39 // we'll repeat measurements with more iterations until
     40 // we get a timing above that threshold.
     41 const float min_accurate_time = 1e-2f;
     42 
     43 // See --min-working-set-size command line parameter.
     44 size_t min_working_set_size = 0;
     45 
     46 float max_clock_speed = 0.0f;
     47 
     48 // range of sizes that we will benchmark (in all 3 K,M,N dimensions)
     49 const size_t maxsize = 2048;
     50 const size_t minsize = 16;
     51 
     52 typedef MatrixXf MatrixType;
     53 typedef MatrixType::Scalar Scalar;
     54 typedef internal::packet_traits<Scalar>::type Packet;
     55 
     56 static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
     57 static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
     58 static_assert(maxsize > minsize, "maxsize must be larger than minsize");
     59 static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
     60 
     61 // just a helper to store a triple of K,M,N sizes for matrix product
     62 struct size_triple_t
     63 {
     64   size_t k, m, n;
     65   size_triple_t() : k(0), m(0), n(0) {}
     66   size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
     67   size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
     68   size_triple_t(uint16_t compact)
     69   {
     70     k = 1 << ((compact & 0xf00) >> 8);
     71     m = 1 << ((compact & 0x0f0) >> 4);
     72     n = 1 << ((compact & 0x00f) >> 0);
     73   }
     74 };
     75 
     76 uint8_t log2_pot(size_t x) {
     77   size_t l = 0;
     78   while (x >>= 1) l++;
     79   return l;
     80 }
     81 
     82 // Convert between size tripes and a compact form fitting in 12 bits
     83 // where each size, which must be a POT, is encoded as its log2, on 4 bits
     84 // so the largest representable size is 2^15 == 32k  ... big enough.
     85 uint16_t compact_size_triple(size_t k, size_t m, size_t n)
     86 {
     87   return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
     88 }
     89 
     90 uint16_t compact_size_triple(const size_triple_t& t)
     91 {
     92   return compact_size_triple(t.k, t.m, t.n);
     93 }
     94 
     95 // A single benchmark. Initially only contains benchmark params.
     96 // Then call run(), which stores the result in the gflops field.
     97 struct benchmark_t
     98 {
     99   uint16_t compact_product_size;
    100   uint16_t compact_block_size;
    101   bool use_default_block_size;
    102   float gflops;
    103   benchmark_t()
    104     : compact_product_size(0)
    105     , compact_block_size(0)
    106     , use_default_block_size(false)
    107     , gflops(0)
    108   {
    109   }
    110   benchmark_t(size_t pk, size_t pm, size_t pn,
    111               size_t bk, size_t bm, size_t bn)
    112     : compact_product_size(compact_size_triple(pk, pm, pn))
    113     , compact_block_size(compact_size_triple(bk, bm, bn))
    114     , use_default_block_size(false)
    115     , gflops(0)
    116   {}
    117   benchmark_t(size_t pk, size_t pm, size_t pn)
    118     : compact_product_size(compact_size_triple(pk, pm, pn))
    119     , compact_block_size(0)
    120     , use_default_block_size(true)
    121     , gflops(0)
    122   {}
    123 
    124   void run();
    125 };
    126 
    127 ostream& operator<<(ostream& s, const benchmark_t& b)
    128 {
    129   s << hex << b.compact_product_size << dec;
    130   if (b.use_default_block_size) {
    131     size_triple_t t(b.compact_product_size);
    132     Index k = t.k, m = t.m, n = t.n;
    133     internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
    134     s << " default(" << k << ", " << m << ", " << n << ")";
    135   } else {
    136     s << " " << hex << b.compact_block_size << dec;
    137   }
    138   s << " " << b.gflops;
    139   return s;
    140 }
    141 
    142 // We sort first by increasing benchmark parameters,
    143 // then by decreasing performance.
    144 bool operator<(const benchmark_t& b1, const benchmark_t& b2)
    145 {
    146   return b1.compact_product_size < b2.compact_product_size ||
    147            (b1.compact_product_size == b2.compact_product_size && (
    148              (b1.compact_block_size < b2.compact_block_size || (
    149                b1.compact_block_size == b2.compact_block_size &&
    150                  b1.gflops > b2.gflops))));
    151 }
    152 
    153 void benchmark_t::run()
    154 {
    155   size_triple_t productsizes(compact_product_size);
    156 
    157   if (use_default_block_size) {
    158     eigen_use_specific_block_size = false;
    159   } else {
    160     // feed eigen with our custom blocking params
    161     eigen_use_specific_block_size = true;
    162     size_triple_t blocksizes(compact_block_size);
    163     eigen_block_size_k = blocksizes.k;
    164     eigen_block_size_m = blocksizes.m;
    165     eigen_block_size_n = blocksizes.n;
    166   }
    167 
    168   // set up the matrix pool
    169 
    170   const size_t combined_three_matrices_sizes =
    171     sizeof(Scalar) *
    172       (productsizes.k * productsizes.m +
    173        productsizes.k * productsizes.n +
    174        productsizes.m * productsizes.n);
    175 
    176   // 64 M is large enough that nobody has a cache bigger than that,
    177   // while still being small enough that everybody has this much RAM,
    178   // so conveniently we don't need to special-case platforms here.
    179   const size_t unlikely_large_cache_size = 64 << 20;
    180 
    181   const size_t working_set_size =
    182     min_working_set_size ? min_working_set_size : unlikely_large_cache_size;
    183 
    184   const size_t matrix_pool_size =
    185     1 + working_set_size / combined_three_matrices_sizes;
    186 
    187   MatrixType *lhs = new MatrixType[matrix_pool_size];
    188   MatrixType *rhs = new MatrixType[matrix_pool_size];
    189   MatrixType *dst = new MatrixType[matrix_pool_size];
    190 
    191   for (size_t i = 0; i < matrix_pool_size; i++) {
    192     lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
    193     rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
    194     dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
    195   }
    196 
    197   // main benchmark loop
    198 
    199   int iters_at_a_time = 1;
    200   float time_per_iter = 0.0f;
    201   size_t matrix_index = 0;
    202   while (true) {
    203 
    204     double starttime = timer.getCpuTime();
    205     for (int i = 0; i < iters_at_a_time; i++) {
    206       dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
    207       matrix_index++;
    208       if (matrix_index == matrix_pool_size) {
    209         matrix_index = 0;
    210       }
    211     }
    212     double endtime = timer.getCpuTime();
    213 
    214     const float timing = float(endtime - starttime);
    215 
    216     if (timing >= min_accurate_time) {
    217       time_per_iter = timing / iters_at_a_time;
    218       break;
    219     }
    220 
    221     iters_at_a_time *= 2;
    222   }
    223 
    224   delete[] lhs;
    225   delete[] rhs;
    226   delete[] dst;
    227 
    228   gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
    229 }
    230 
    231 void print_cpuinfo()
    232 {
    233 #ifdef __linux__
    234   cout << "contents of /proc/cpuinfo:" << endl;
    235   string line;
    236   ifstream cpuinfo("/proc/cpuinfo");
    237   if (cpuinfo.is_open()) {
    238     while (getline(cpuinfo, line)) {
    239       cout << line << endl;
    240     }
    241     cpuinfo.close();
    242   }
    243   cout << endl;
    244 #elif defined __APPLE__
    245   cout << "output of sysctl hw:" << endl;
    246   system("sysctl hw");
    247   cout << endl;
    248 #endif
    249 }
    250 
    251 template <typename T>
    252 string type_name()
    253 {
    254   return "unknown";
    255 }
    256 
    257 template<>
    258 string type_name<float>()
    259 {
    260   return "float";
    261 }
    262 
    263 template<>
    264 string type_name<double>()
    265 {
    266   return "double";
    267 }
    268 
    269 struct action_t
    270 {
    271   virtual const char* invokation_name() const { abort(); return nullptr; }
    272   virtual void run() const { abort(); }
    273   virtual ~action_t() {}
    274 };
    275 
    276 void show_usage_and_exit(int /*argc*/, char* argv[],
    277                          const vector<unique_ptr<action_t>>& available_actions)
    278 {
    279   cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
    280   cerr << "available actions:" << endl << endl;
    281   for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
    282     cerr << "  " << (*it)->invokation_name() << endl;
    283   }
    284   cerr << endl;
    285   cerr << "options:" << endl << endl;
    286   cerr << "  --min-working-set-size=N:" << endl;
    287   cerr << "       Set the minimum working set size to N bytes." << endl;
    288   cerr << "       This is rounded up as needed to a multiple of matrix size." << endl;
    289   cerr << "       A larger working set lowers the chance of a warm cache." << endl;
    290   cerr << "       The default value 0 means use a large enough working" << endl;
    291   cerr << "       set to likely outsize caches." << endl;
    292   cerr << "       A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
    293   cerr << "       avoid warm caches." << endl;
    294   exit(1);
    295 }
    296 
    297 float measure_clock_speed()
    298 {
    299   cerr << "Measuring clock speed...                              \r" << flush;
    300 
    301   vector<float> all_gflops;
    302   for (int i = 0; i < 8; i++) {
    303     benchmark_t b(1024, 1024, 1024);
    304     b.run();
    305     all_gflops.push_back(b.gflops);
    306   }
    307 
    308   sort(all_gflops.begin(), all_gflops.end());
    309   float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
    310 
    311   // multiply by an arbitrary constant to discourage trying doing anything with the
    312   // returned values besides just comparing them with each other.
    313   float result = stable_estimate * 123.456f;
    314 
    315   return result;
    316 }
    317 
    318 struct human_duration_t
    319 {
    320   int seconds;
    321   human_duration_t(int s) : seconds(s) {}
    322 };
    323 
    324 ostream& operator<<(ostream& s, const human_duration_t& d)
    325 {
    326   int remainder = d.seconds;
    327   if (remainder > 3600) {
    328     int hours = remainder / 3600;
    329     s << hours << " h ";
    330     remainder -= hours * 3600;
    331   }
    332   if (remainder > 60) {
    333     int minutes = remainder / 60;
    334     s << minutes << " min ";
    335     remainder -= minutes * 60;
    336   }
    337   if (d.seconds < 600) {
    338     s << remainder << " s";
    339   }
    340   return s;
    341 }
    342 
    343 const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";
    344 
    345 void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
    346 {
    347   FILE* file = fopen(filename, "w");
    348   if (!file) {
    349     cerr << "Could not open file " << filename << " for writing." << endl;
    350     cerr << "Do you have write permissions on the current working directory?" << endl;
    351     exit(1);
    352   }
    353   size_t benchmarks_vector_size = benchmarks.size();
    354   fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
    355   fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
    356   fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
    357   fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
    358   fclose(file);
    359 }
    360 
    361 bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
    362 {
    363   FILE* file = fopen(filename, "r");
    364   if (!file) {
    365     return false;
    366   }
    367   if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
    368     return false;
    369   }
    370   size_t benchmarks_vector_size = 0;
    371   if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
    372     return false;
    373   }
    374   if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
    375     return false;
    376   }
    377   benchmarks.resize(benchmarks_vector_size);
    378   if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
    379     return false;
    380   }
    381   unlink(filename);
    382   return true;
    383 }
    384 
    385 void try_run_some_benchmarks(
    386   vector<benchmark_t>& benchmarks,
    387   double time_start,
    388   size_t& first_benchmark_to_run)
    389 {
    390   if (first_benchmark_to_run == benchmarks.size()) {
    391     return;
    392   }
    393 
    394   double time_last_progress_update = 0;
    395   double time_last_clock_speed_measurement = 0;
    396   double time_now = 0;
    397 
    398   size_t benchmark_index = first_benchmark_to_run;
    399 
    400   while (true) {
    401     float ratio_done = float(benchmark_index) / benchmarks.size();
    402     time_now = timer.getRealTime();
    403 
    404     // We check clock speed every minute and at the end.
    405     if (benchmark_index == benchmarks.size() ||
    406         time_now > time_last_clock_speed_measurement + 60.0f)
    407     {
    408       time_last_clock_speed_measurement = time_now;
    409 
    410       // Ensure that clock speed is as expected
    411       float current_clock_speed = measure_clock_speed();
    412 
    413       // The tolerance needs to be smaller than the relative difference between
    414       // clock speeds that a device could operate under.
    415       // It seems unlikely that a device would be throttling clock speeds by
    416       // amounts smaller than 2%.
    417       // With a value of 1%, I was getting within noise on a Sandy Bridge.
    418       const float clock_speed_tolerance = 0.02f;
    419 
    420       if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
    421         // Clock speed is now higher than we previously measured.
    422         // Either our initial measurement was inaccurate, which won't happen
    423         // too many times as we are keeping the best clock speed value and
    424         // and allowing some tolerance; or something really weird happened,
    425         // which invalidates all benchmark results collected so far.
    426         // Either way, we better restart all over again now.
    427         if (benchmark_index) {
    428           cerr << "Restarting at " << 100.0f * ratio_done
    429                << " % because clock speed increased.          " << endl;
    430         }
    431         max_clock_speed = current_clock_speed;
    432         first_benchmark_to_run = 0;
    433         return;
    434       }
    435 
    436       bool rerun_last_tests = false;
    437 
    438       if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
    439         cerr << "Measurements completed so far: "
    440              << 100.0f * ratio_done
    441              << " %                             " << endl;
    442         cerr << "Clock speed seems to be only "
    443              << current_clock_speed/max_clock_speed
    444              << " times what it used to be." << endl;
    445 
    446         unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
    447 
    448         while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
    449           if (seconds_to_sleep_if_lower_clock_speed > 32) {
    450             cerr << "Sleeping longer probably won't make a difference." << endl;
    451             cerr << "Serializing benchmarks to " << session_filename << endl;
    452             serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
    453             cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
    454             exit(2);
    455           }
    456           rerun_last_tests = true;
    457           cerr << "Sleeping "
    458                << seconds_to_sleep_if_lower_clock_speed
    459                << " s...                                   \r" << endl;
    460           sleep(seconds_to_sleep_if_lower_clock_speed);
    461           current_clock_speed = measure_clock_speed();
    462           seconds_to_sleep_if_lower_clock_speed *= 2;
    463         }
    464       }
    465 
    466       if (rerun_last_tests) {
    467         cerr << "Redoing the last "
    468              << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
    469              << " % because clock speed had been low.   " << endl;
    470         return;
    471       }
    472 
    473       // nothing wrong with the clock speed so far, so there won't be a need to rerun
    474       // benchmarks run so far in case we later encounter a lower clock speed.
    475       first_benchmark_to_run = benchmark_index;
    476     }
    477 
    478     if (benchmark_index == benchmarks.size()) {
    479       // We're done!
    480       first_benchmark_to_run = benchmarks.size();
    481       // Erase progress info
    482       cerr << "                                                            " << endl;
    483       return;
    484     }
    485 
    486     // Display progress info on stderr
    487     if (time_now > time_last_progress_update + 1.0f) {
    488       time_last_progress_update = time_now;
    489       cerr << "Measurements... " << 100.0f * ratio_done
    490            << " %, ETA "
    491            << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
    492            << "                          \r" << flush;
    493     }
    494 
    495     // This is where we actually run a benchmark!
    496     benchmarks[benchmark_index].run();
    497     benchmark_index++;
    498   }
    499 }
    500 
    501 void run_benchmarks(vector<benchmark_t>& benchmarks)
    502 {
    503   size_t first_benchmark_to_run;
    504   vector<benchmark_t> deserialized_benchmarks;
    505   bool use_deserialized_benchmarks = false;
    506   if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
    507     cerr << "Found serialized session with "
    508          << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
    509          << " % already done" << endl;
    510     if (deserialized_benchmarks.size() == benchmarks.size() &&
    511         first_benchmark_to_run > 0 &&
    512         first_benchmark_to_run < benchmarks.size())
    513     {
    514       use_deserialized_benchmarks = true;
    515     }
    516   }
    517 
    518   if (use_deserialized_benchmarks) {
    519     benchmarks = deserialized_benchmarks;
    520   } else {
    521     // not using deserialized benchmarks, starting from scratch
    522     first_benchmark_to_run = 0;
    523 
    524     // Randomly shuffling benchmarks allows us to get accurate enough progress info,
    525     // as now the cheap/expensive benchmarks are randomly mixed so they average out.
    526     // It also means that if data is corrupted for some time span, the odds are that
    527     // not all repetitions of a given benchmark will be corrupted.
    528     random_shuffle(benchmarks.begin(), benchmarks.end());
    529   }
    530 
    531   for (int i = 0; i < 4; i++) {
    532     max_clock_speed = max(max_clock_speed, measure_clock_speed());
    533   }
    534 
    535   double time_start = 0.0;
    536   while (first_benchmark_to_run < benchmarks.size()) {
    537     if (first_benchmark_to_run == 0) {
    538       time_start = timer.getRealTime();
    539     }
    540     try_run_some_benchmarks(benchmarks,
    541                             time_start,
    542                             first_benchmark_to_run);
    543   }
    544 
    545   // Sort timings by increasing benchmark parameters, and decreasing gflops.
    546   // The latter is very important. It means that we can ignore all but the first
    547   // benchmark with given parameters.
    548   sort(benchmarks.begin(), benchmarks.end());
    549 
    550   // Collect best (i.e. now first) results for each parameter values.
    551   vector<benchmark_t> best_benchmarks;
    552   for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
    553     if (best_benchmarks.empty() ||
    554         best_benchmarks.back().compact_product_size != it->compact_product_size ||
    555         best_benchmarks.back().compact_block_size != it->compact_block_size)
    556     {
    557       best_benchmarks.push_back(*it);
    558     }
    559   }
    560 
    561   // keep and return only the best benchmarks
    562   benchmarks = best_benchmarks;
    563 }
    564 
    565 struct measure_all_pot_sizes_action_t : action_t
    566 {
    567   virtual const char* invokation_name() const { return "all-pot-sizes"; }
    568   virtual void run() const
    569   {
    570     vector<benchmark_t> benchmarks;
    571     for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
    572       for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
    573         for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
    574           for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
    575             for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
    576               for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
    577                 for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
    578                   benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
    579                 }
    580               }
    581             }
    582           }
    583         }
    584       }
    585     }
    586 
    587     run_benchmarks(benchmarks);
    588 
    589     cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
    590     for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
    591       cout << *it << endl;
    592     }
    593   }
    594 };
    595 
    596 struct measure_default_sizes_action_t : action_t
    597 {
    598   virtual const char* invokation_name() const { return "default-sizes"; }
    599   virtual void run() const
    600   {
    601     vector<benchmark_t> benchmarks;
    602     for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
    603       for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
    604         for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
    605           for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
    606             benchmarks.emplace_back(ksize, msize, nsize);
    607           }
    608         }
    609       }
    610     }
    611 
    612     run_benchmarks(benchmarks);
    613 
    614     cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
    615     for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
    616       cout << *it << endl;
    617     }
    618   }
    619 };
    620 
    621 int main(int argc, char* argv[])
    622 {
    623   double time_start = timer.getRealTime();
    624   cout.precision(4);
    625   cerr.precision(4);
    626 
    627   vector<unique_ptr<action_t>> available_actions;
    628   available_actions.emplace_back(new measure_all_pot_sizes_action_t);
    629   available_actions.emplace_back(new measure_default_sizes_action_t);
    630 
    631   auto action = available_actions.end();
    632 
    633   if (argc <= 1) {
    634     show_usage_and_exit(argc, argv, available_actions);
    635   }
    636   for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
    637     if (!strcmp(argv[1], (*it)->invokation_name())) {
    638       action = it;
    639       break;
    640     }
    641   }
    642 
    643   if (action == available_actions.end()) {
    644     show_usage_and_exit(argc, argv, available_actions);
    645   }
    646 
    647   for (int i = 2; i < argc; i++) {
    648     if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
    649       const char* equals_sign = strchr(argv[i], '=');
    650       min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
    651     } else {
    652       cerr << "unrecognized option: " << argv[i] << endl << endl;
    653       show_usage_and_exit(argc, argv, available_actions);
    654     }
    655   }
    656 
    657   print_cpuinfo();
    658 
    659   cout << "benchmark parameters:" << endl;
    660   cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
    661   cout << "scalar type: " << type_name<Scalar>() << endl;
    662   cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
    663   cout << "minsize = " << minsize << endl;
    664   cout << "maxsize = " << maxsize << endl;
    665   cout << "measurement_repetitions = " << measurement_repetitions << endl;
    666   cout << "min_accurate_time = " << min_accurate_time << endl;
    667   cout << "min_working_set_size = " << min_working_set_size;
    668   if (min_working_set_size == 0) {
    669     cout << " (try to outsize caches)";
    670   }
    671   cout << endl << endl;
    672 
    673   (*action)->run();
    674 
    675   double time_end = timer.getRealTime();
    676   cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
    677 }
    678