Home | History | Annotate | Download | only in bench
      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2015 Benoit Jacob <benoitjacob (at) google.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #include <iostream>
     11 #include <cstdint>
     12 #include <cstdlib>
     13 #include <vector>
     14 #include <algorithm>
     15 #include <fstream>
     16 #include <string>
     17 #include <cmath>
     18 #include <cassert>
     19 #include <cstring>
     20 #include <memory>
     21 
     22 #include <Eigen/Core>
     23 
     24 using namespace std;
     25 
     26 const int default_precision = 4;
     27 
     28 // see --only-cubic-sizes
     29 bool only_cubic_sizes = false;
     30 
     31 // see --dump-tables
     32 bool dump_tables = false;
     33 
     34 uint8_t log2_pot(size_t x) {
     35   size_t l = 0;
     36   while (x >>= 1) l++;
     37   return l;
     38 }
     39 
     40 uint16_t compact_size_triple(size_t k, size_t m, size_t n)
     41 {
     42   return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
     43 }
     44 
     45 // just a helper to store a triple of K,M,N sizes for matrix product
     46 struct size_triple_t
     47 {
     48   uint16_t k, m, n;
     49   size_triple_t() : k(0), m(0), n(0) {}
     50   size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
     51   size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
     52   size_triple_t(uint16_t compact)
     53   {
     54     k = 1 << ((compact & 0xf00) >> 8);
     55     m = 1 << ((compact & 0x0f0) >> 4);
     56     n = 1 << ((compact & 0x00f) >> 0);
     57   }
     58   bool is_cubic() const { return k == m && m == n; }
     59 };
     60 
     61 ostream& operator<<(ostream& s, const size_triple_t& t)
     62 {
     63   return s << "(" << t.k << ", " << t.m << ", " << t.n << ")";
     64 }
     65 
     66 struct inputfile_entry_t
     67 {
     68   uint16_t product_size;
     69   uint16_t pot_block_size;
     70   size_triple_t nonpot_block_size;
     71   float gflops;
     72 };
     73 
     74 struct inputfile_t
     75 {
     76   enum class type_t {
     77     unknown,
     78     all_pot_sizes,
     79     default_sizes
     80   };
     81 
     82   string filename;
     83   vector<inputfile_entry_t> entries;
     84   type_t type;
     85 
     86   inputfile_t(const string& fname)
     87     : filename(fname)
     88     , type(type_t::unknown)
     89   {
     90     ifstream stream(filename);
     91     if (!stream.is_open()) {
     92       cerr << "couldn't open input file: " << filename << endl;
     93       exit(1);
     94     }
     95     string line;
     96     while (getline(stream, line)) {
     97       if (line.empty()) continue;
     98       if (line.find("BEGIN MEASUREMENTS ALL POT SIZES") == 0) {
     99         if (type != type_t::unknown) {
    100           cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
    101           exit(1);
    102         }
    103         type = type_t::all_pot_sizes;
    104         continue;
    105       }
    106       if (line.find("BEGIN MEASUREMENTS DEFAULT SIZES") == 0) {
    107         if (type != type_t::unknown) {
    108           cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
    109           exit(1);
    110         }
    111         type = type_t::default_sizes;
    112         continue;
    113       }
    114 
    115 
    116       if (type == type_t::unknown) {
    117         continue;
    118       }
    119       switch(type) {
    120         case type_t::all_pot_sizes: {
    121           unsigned int product_size, block_size;
    122           float gflops;
    123           int sscanf_result =
    124             sscanf(line.c_str(), "%x %x %f",
    125                    &product_size,
    126                    &block_size,
    127                    &gflops);
    128           if (3 != sscanf_result ||
    129               !product_size ||
    130               product_size > 0xfff ||
    131               !block_size ||
    132               block_size > 0xfff ||
    133               !isfinite(gflops))
    134           {
    135             cerr << "ill-formed input file: " << filename << endl;
    136             cerr << "offending line:" << endl << line << endl;
    137             exit(1);
    138           }
    139           if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
    140             continue;
    141           }
    142           inputfile_entry_t entry;
    143           entry.product_size = uint16_t(product_size);
    144           entry.pot_block_size = uint16_t(block_size);
    145           entry.gflops = gflops;
    146           entries.push_back(entry);
    147           break;
    148         }
    149         case type_t::default_sizes: {
    150           unsigned int product_size;
    151           float gflops;
    152           int bk, bm, bn;
    153           int sscanf_result =
    154             sscanf(line.c_str(), "%x default(%d, %d, %d) %f",
    155                    &product_size,
    156                    &bk, &bm, &bn,
    157                    &gflops);
    158           if (5 != sscanf_result ||
    159               !product_size ||
    160               product_size > 0xfff ||
    161               !isfinite(gflops))
    162           {
    163             cerr << "ill-formed input file: " << filename << endl;
    164             cerr << "offending line:" << endl << line << endl;
    165             exit(1);
    166           }
    167           if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
    168             continue;
    169           }
    170           inputfile_entry_t entry;
    171           entry.product_size = uint16_t(product_size);
    172           entry.pot_block_size = 0;
    173           entry.nonpot_block_size = size_triple_t(bk, bm, bn);
    174           entry.gflops = gflops;
    175           entries.push_back(entry);
    176           break;
    177         }
    178 
    179         default:
    180           break;
    181       }
    182     }
    183     stream.close();
    184     if (type == type_t::unknown) {
    185       cerr << "Unrecognized input file " << filename << endl;
    186       exit(1);
    187     }
    188     if (entries.empty()) {
    189       cerr << "didn't find any measurements in input file: " << filename << endl;
    190       exit(1);
    191     }
    192   }
    193 };
    194 
    195 struct preprocessed_inputfile_entry_t
    196 {
    197   uint16_t product_size;
    198   uint16_t block_size;
    199 
    200   float efficiency;
    201 };
    202 
    203 bool lower_efficiency(const preprocessed_inputfile_entry_t& e1, const preprocessed_inputfile_entry_t& e2)
    204 {
    205   return e1.efficiency < e2.efficiency;
    206 }
    207 
    208 struct preprocessed_inputfile_t
    209 {
    210   string filename;
    211   vector<preprocessed_inputfile_entry_t> entries;
    212 
    213   preprocessed_inputfile_t(const inputfile_t& inputfile)
    214     : filename(inputfile.filename)
    215   {
    216     if (inputfile.type != inputfile_t::type_t::all_pot_sizes) {
    217       abort();
    218     }
    219     auto it = inputfile.entries.begin();
    220     auto it_first_with_given_product_size = it;
    221     while (it != inputfile.entries.end()) {
    222       ++it;
    223       if (it == inputfile.entries.end() ||
    224         it->product_size != it_first_with_given_product_size->product_size)
    225       {
    226         import_input_file_range_one_product_size(it_first_with_given_product_size, it);
    227         it_first_with_given_product_size = it;
    228       }
    229     }
    230   }
    231 
    232 private:
    233   void import_input_file_range_one_product_size(
    234     const vector<inputfile_entry_t>::const_iterator& begin,
    235     const vector<inputfile_entry_t>::const_iterator& end)
    236   {
    237     uint16_t product_size = begin->product_size;
    238     float max_gflops = 0.0f;
    239     for (auto it = begin; it != end; ++it) {
    240       if (it->product_size != product_size) {
    241         cerr << "Unexpected ordering of entries in " << filename << endl;
    242         cerr << "(Expected all entries for product size " << hex << product_size << dec << " to be grouped)" << endl;
    243         exit(1);
    244       }
    245       max_gflops = max(max_gflops, it->gflops);
    246     }
    247     for (auto it = begin; it != end; ++it) {
    248       preprocessed_inputfile_entry_t entry;
    249       entry.product_size = it->product_size;
    250       entry.block_size = it->pot_block_size;
    251       entry.efficiency = it->gflops / max_gflops;
    252       entries.push_back(entry);
    253     }
    254   }
    255 };
    256 
    257 void check_all_files_in_same_exact_order(
    258        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles)
    259 {
    260   if (preprocessed_inputfiles.empty()) {
    261     return;
    262   }
    263 
    264   const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[0];
    265   const size_t num_entries = first_file.entries.size();
    266 
    267   for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) {
    268     if (preprocessed_inputfiles[i].entries.size() != num_entries) {
    269       cerr << "these files have different number of entries: "
    270            << preprocessed_inputfiles[i].filename
    271            << " and "
    272            << first_file.filename
    273            << endl;
    274       exit(1);
    275     }
    276   }
    277 
    278   for (size_t entry_index = 0; entry_index < num_entries; entry_index++) {
    279     const uint16_t entry_product_size = first_file.entries[entry_index].product_size;
    280     const uint16_t entry_block_size = first_file.entries[entry_index].block_size;
    281     for (size_t file_index = 0; file_index < preprocessed_inputfiles.size(); file_index++) {
    282       const preprocessed_inputfile_t& cur_file = preprocessed_inputfiles[file_index];
    283       if (cur_file.entries[entry_index].product_size != entry_product_size ||
    284           cur_file.entries[entry_index].block_size != entry_block_size)
    285       {
    286         cerr << "entries not in same order between these files: "
    287              << first_file.filename
    288              << " and "
    289              << cur_file.filename
    290              << endl;
    291         exit(1);
    292       }
    293     }
    294   }
    295 }
    296 
    297 float efficiency_of_subset(
    298         const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
    299         const vector<size_t>& subset)
    300 {
    301   if (subset.size() <= 1) {
    302     return 1.0f;
    303   }
    304   const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
    305   const size_t num_entries = first_file.entries.size();
    306   float efficiency = 1.0f;
    307   size_t entry_index = 0;
    308   size_t first_entry_index_with_this_product_size = 0;
    309   uint16_t product_size = first_file.entries[0].product_size;
    310   while (entry_index < num_entries) {
    311     ++entry_index;
    312     if (entry_index == num_entries ||
    313         first_file.entries[entry_index].product_size != product_size)
    314     {
    315       float efficiency_this_product_size = 0.0f;
    316       for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
    317         float efficiency_this_entry = 1.0f;
    318         for (auto i = subset.begin(); i != subset.end(); ++i) {
    319           efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
    320         }
    321         efficiency_this_product_size = max(efficiency_this_product_size, efficiency_this_entry);
    322       }
    323       efficiency = min(efficiency, efficiency_this_product_size);
    324       if (entry_index < num_entries) {
    325         first_entry_index_with_this_product_size = entry_index;
    326         product_size = first_file.entries[entry_index].product_size;
    327       }
    328     }
    329   }
    330 
    331   return efficiency;
    332 }
    333 
    334 void dump_table_for_subset(
    335         const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
    336         const vector<size_t>& subset)
    337 {
    338   const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
    339   const size_t num_entries = first_file.entries.size();
    340   size_t entry_index = 0;
    341   size_t first_entry_index_with_this_product_size = 0;
    342   uint16_t product_size = first_file.entries[0].product_size;
    343   size_t i = 0;
    344   size_triple_t min_product_size(first_file.entries.front().product_size);
    345   size_triple_t max_product_size(first_file.entries.back().product_size);
    346   if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) {
    347     abort();
    348   }
    349   if (only_cubic_sizes) {
    350     cerr << "Can't generate tables with --only-cubic-sizes." << endl;
    351     abort();
    352   }
    353   cout << "struct LookupTable {" << endl;
    354   cout << "  static const size_t BaseSize = " << min_product_size.k << ";" << endl;
    355   const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1;
    356   const size_t TableSize = NumSizes * NumSizes * NumSizes;
    357   cout << "  static const size_t NumSizes = " << NumSizes << ";" << endl;
    358   cout << "  static const unsigned short* Data() {" << endl;
    359   cout << "    static const unsigned short data[" << TableSize << "] = {";
    360   while (entry_index < num_entries) {
    361     ++entry_index;
    362     if (entry_index == num_entries ||
    363         first_file.entries[entry_index].product_size != product_size)
    364     {
    365       float best_efficiency_this_product_size = 0.0f;
    366       uint16_t best_block_size_this_product_size = 0;
    367       for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
    368         float efficiency_this_entry = 1.0f;
    369         for (auto i = subset.begin(); i != subset.end(); ++i) {
    370           efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
    371         }
    372         if (efficiency_this_entry > best_efficiency_this_product_size) {
    373           best_efficiency_this_product_size = efficiency_this_entry;
    374           best_block_size_this_product_size = first_file.entries[e].block_size;
    375         }
    376       }
    377       if ((i++) % NumSizes) {
    378         cout << " ";
    379       } else {
    380         cout << endl << "      ";
    381       }
    382       cout << "0x" << hex << best_block_size_this_product_size << dec;
    383       if (entry_index < num_entries) {
    384         cout << ",";
    385         first_entry_index_with_this_product_size = entry_index;
    386         product_size = first_file.entries[entry_index].product_size;
    387       }
    388     }
    389   }
    390   if (i != TableSize) {
    391     cerr << endl << "Wrote " << i << " table entries, expected " << TableSize << endl;
    392     abort();
    393   }
    394   cout << endl << "    };" << endl;
    395   cout << "    return data;" << endl;
    396   cout << "  }" << endl;
    397   cout << "};" << endl;
    398 }
    399 
    400 float efficiency_of_partition(
    401         const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
    402         const vector<vector<size_t>>& partition)
    403 {
    404   float efficiency = 1.0f;
    405   for (auto s = partition.begin(); s != partition.end(); ++s) {
    406     efficiency = min(efficiency, efficiency_of_subset(preprocessed_inputfiles, *s));
    407   }
    408   return efficiency;
    409 }
    410 
    411 void make_first_subset(size_t subset_size, vector<size_t>& out_subset, size_t set_size)
    412 {
    413   assert(subset_size >= 1 && subset_size <= set_size);
    414   out_subset.resize(subset_size);
    415   for (size_t i = 0; i < subset_size; i++) {
    416     out_subset[i] = i;
    417   }
    418 }
    419 
    420 bool is_last_subset(const vector<size_t>& subset, size_t set_size)
    421 {
    422   return subset[0] == set_size - subset.size();
    423 }
    424 
    425 void next_subset(vector<size_t>& inout_subset, size_t set_size)
    426 {
    427   if (is_last_subset(inout_subset, set_size)) {
    428     cerr << "iterating past the last subset" << endl;
    429     abort();
    430   }
    431   size_t i = 1;
    432   while (inout_subset[inout_subset.size() - i] == set_size - i) {
    433     i++;
    434     assert(i <= inout_subset.size());
    435   }
    436   size_t first_index_to_change = inout_subset.size() - i;
    437   inout_subset[first_index_to_change]++;
    438   size_t p = inout_subset[first_index_to_change];
    439   for (size_t j = first_index_to_change + 1; j < inout_subset.size(); j++) {
    440     inout_subset[j] = ++p;
    441   }
    442 }
    443 
    444 const size_t number_of_subsets_limit = 100;
    445 const size_t always_search_subsets_of_size_at_least = 2;
    446 
    447 bool is_number_of_subsets_feasible(size_t n, size_t p)
    448 {
    449   assert(n>0 && p>0 && p<=n);
    450   uint64_t numerator = 1, denominator = 1;
    451   for (size_t i = 0; i < p; i++) {
    452     numerator *= n - i;
    453     denominator *= i + 1;
    454     if (numerator > denominator * number_of_subsets_limit) {
    455       return false;
    456     }
    457   }
    458   return true;
    459 }
    460 
    461 size_t max_feasible_subset_size(size_t n)
    462 {
    463   assert(n > 0);
    464   const size_t minresult = min<size_t>(n-1, always_search_subsets_of_size_at_least);
    465   for (size_t p = 1; p <= n - 1; p++) {
    466     if (!is_number_of_subsets_feasible(n, p+1)) {
    467       return max(p, minresult);
    468     }
    469   }
    470   return n - 1;
    471 }
    472 
    473 void find_subset_with_efficiency_higher_than(
    474        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
    475        float required_efficiency_to_beat,
    476        vector<size_t>& inout_remainder,
    477        vector<size_t>& out_subset)
    478 {
    479   out_subset.resize(0);
    480 
    481   if (required_efficiency_to_beat >= 1.0f) {
    482     cerr << "can't beat efficiency 1." << endl;
    483     abort();
    484   }
    485 
    486   while (!inout_remainder.empty()) {
    487 
    488     vector<size_t> candidate_indices(inout_remainder.size());
    489     for (size_t i = 0; i < candidate_indices.size(); i++) {
    490       candidate_indices[i] = i;
    491     }
    492 
    493     size_t candidate_indices_subset_size = max_feasible_subset_size(candidate_indices.size());
    494     while (candidate_indices_subset_size >= 1) {
    495       vector<size_t> candidate_indices_subset;
    496       make_first_subset(candidate_indices_subset_size,
    497                         candidate_indices_subset,
    498                         candidate_indices.size());
    499 
    500       vector<size_t> best_candidate_indices_subset;
    501       float best_efficiency = 0.0f;
    502       vector<size_t> trial_subset = out_subset;
    503       trial_subset.resize(out_subset.size() + candidate_indices_subset_size);
    504       while (true)
    505       {
    506         for (size_t i = 0; i < candidate_indices_subset_size; i++) {
    507           trial_subset[out_subset.size() + i] = inout_remainder[candidate_indices_subset[i]];
    508         }
    509 
    510         float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
    511         if (trial_efficiency > best_efficiency) {
    512           best_efficiency = trial_efficiency;
    513           best_candidate_indices_subset = candidate_indices_subset;
    514         }
    515         if (is_last_subset(candidate_indices_subset, candidate_indices.size())) {
    516           break;
    517         }
    518         next_subset(candidate_indices_subset, candidate_indices.size());
    519       }
    520 
    521       if (best_efficiency > required_efficiency_to_beat) {
    522         for (size_t i = 0; i < best_candidate_indices_subset.size(); i++) {
    523           candidate_indices[i] = candidate_indices[best_candidate_indices_subset[i]];
    524         }
    525         candidate_indices.resize(best_candidate_indices_subset.size());
    526       }
    527       candidate_indices_subset_size--;
    528     }
    529 
    530     size_t candidate_index = candidate_indices[0];
    531     auto candidate_iterator = inout_remainder.begin() + candidate_index;
    532     vector<size_t> trial_subset = out_subset;
    533 
    534     trial_subset.push_back(*candidate_iterator);
    535     float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
    536     if (trial_efficiency > required_efficiency_to_beat) {
    537       out_subset.push_back(*candidate_iterator);
    538       inout_remainder.erase(candidate_iterator);
    539     } else {
    540       break;
    541     }
    542   }
    543 }
    544 
    545 void find_partition_with_efficiency_higher_than(
    546        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
    547        float required_efficiency_to_beat,
    548        vector<vector<size_t>>& out_partition)
    549 {
    550   out_partition.resize(0);
    551 
    552   vector<size_t> remainder;
    553   for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) {
    554     remainder.push_back(i);
    555   }
    556 
    557   while (!remainder.empty()) {
    558     vector<size_t> new_subset;
    559     find_subset_with_efficiency_higher_than(
    560       preprocessed_inputfiles,
    561       required_efficiency_to_beat,
    562       remainder,
    563       new_subset);
    564     out_partition.push_back(new_subset);
    565   }
    566 }
    567 
    568 void print_partition(
    569        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
    570        const vector<vector<size_t>>& partition)
    571 {
    572   float efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
    573   cout << "Partition into " << partition.size() << " subsets for " << efficiency * 100.0f << "% efficiency"  << endl;
    574   for (auto subset = partition.begin(); subset != partition.end(); ++subset) {
    575     cout << "  Subset " << (subset - partition.begin())
    576          << ", efficiency " << efficiency_of_subset(preprocessed_inputfiles, *subset) * 100.0f << "%:"
    577          << endl;
    578     for (auto file = subset->begin(); file != subset->end(); ++file) {
    579       cout << "    " << preprocessed_inputfiles[*file].filename << endl;
    580     }
    581     if (dump_tables) {
    582       cout << "  Table:" << endl;
    583       dump_table_for_subset(preprocessed_inputfiles, *subset);
    584     }
    585   }
    586   cout << endl;
    587 }
    588 
    589 struct action_t
    590 {
    591   virtual const char* invokation_name() const { abort(); return nullptr; }
    592   virtual void run(const vector<string>&) const { abort(); }
    593   virtual ~action_t() {}
    594 };
    595 
    596 struct partition_action_t : action_t
    597 {
    598   virtual const char* invokation_name() const override { return "partition"; }
    599   virtual void run(const vector<string>& input_filenames) const override
    600   {
    601     vector<preprocessed_inputfile_t> preprocessed_inputfiles;
    602 
    603     if (input_filenames.empty()) {
    604       cerr << "The " << invokation_name() << " action needs a list of input files." << endl;
    605       exit(1);
    606     }
    607 
    608     for (auto it = input_filenames.begin(); it != input_filenames.end(); ++it) {
    609       inputfile_t inputfile(*it);
    610       switch (inputfile.type) {
    611         case inputfile_t::type_t::all_pot_sizes:
    612           preprocessed_inputfiles.emplace_back(inputfile);
    613           break;
    614         case inputfile_t::type_t::default_sizes:
    615           cerr << "The " << invokation_name() << " action only uses measurements for all pot sizes, and "
    616                << "has no use for " << *it << " which contains measurements for default sizes." << endl;
    617           exit(1);
    618           break;
    619         default:
    620           cerr << "Unrecognized input file: " << *it << endl;
    621           exit(1);
    622       }
    623     }
    624 
    625     check_all_files_in_same_exact_order(preprocessed_inputfiles);
    626 
    627     float required_efficiency_to_beat = 0.0f;
    628     vector<vector<vector<size_t>>> partitions;
    629     cerr << "searching for partitions...\r" << flush;
    630     while (true)
    631     {
    632       vector<vector<size_t>> partition;
    633       find_partition_with_efficiency_higher_than(
    634         preprocessed_inputfiles,
    635         required_efficiency_to_beat,
    636         partition);
    637       float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
    638       cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size()
    639            << " subsets for " << 100.0f * actual_efficiency
    640            << " % efficiency"
    641            << "                  \r" << flush;
    642       partitions.push_back(partition);
    643       if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {
    644         break;
    645       }
    646       required_efficiency_to_beat = actual_efficiency;
    647     }
    648     cerr << "                                                                  " << endl;
    649     while (true) {
    650       bool repeat = false;
    651       for (size_t i = 0; i < partitions.size() - 1; i++) {
    652         if (partitions[i].size() >= partitions[i+1].size()) {
    653           partitions.erase(partitions.begin() + i);
    654           repeat = true;
    655           break;
    656         }
    657       }
    658       if (!repeat) {
    659         break;
    660       }
    661     }
    662     for (auto it = partitions.begin(); it != partitions.end(); ++it) {
    663       print_partition(preprocessed_inputfiles, *it);
    664     }
    665   }
    666 };
    667 
    668 struct evaluate_defaults_action_t : action_t
    669 {
    670   struct results_entry_t {
    671     uint16_t product_size;
    672     size_triple_t default_block_size;
    673     uint16_t best_pot_block_size;
    674     float default_gflops;
    675     float best_pot_gflops;
    676     float default_efficiency;
    677   };
    678   friend ostream& operator<<(ostream& s, const results_entry_t& entry)
    679   {
    680     return s
    681       << "Product size " << size_triple_t(entry.product_size)
    682       << ": default block size " << entry.default_block_size
    683       << " -> " << entry.default_gflops
    684       << " GFlop/s = " << entry.default_efficiency * 100.0f << " %"
    685       << " of best POT block size " << size_triple_t(entry.best_pot_block_size)
    686       << " -> " << entry.best_pot_gflops
    687       << " GFlop/s" << dec;
    688   }
    689   static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) {
    690     return e1.default_efficiency < e2.default_efficiency;
    691   }
    692   virtual const char* invokation_name() const override { return "evaluate-defaults"; }
    693   void show_usage_and_exit() const
    694   {
    695     cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl;
    696     cerr << "checks how well the performance with default sizes compares to the best "
    697          << "performance measured over all POT sizes." << endl;
    698     exit(1);
    699   }
    700   virtual void run(const vector<string>& input_filenames) const override
    701   {
    702     if (input_filenames.size() != 2) {
    703       show_usage_and_exit();
    704     }
    705     inputfile_t inputfile_default_sizes(input_filenames[0]);
    706     inputfile_t inputfile_all_pot_sizes(input_filenames[1]);
    707     if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) {
    708       cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl;
    709       show_usage_and_exit();
    710     }
    711     if (inputfile_all_pot_sizes.type != inputfile_t::type_t::all_pot_sizes) {
    712       cerr << inputfile_all_pot_sizes.filename << " is not an input file with all POT sizes." << endl;
    713       show_usage_and_exit();
    714     }
    715     vector<results_entry_t> results;
    716     vector<results_entry_t> cubic_results;
    717 
    718     uint16_t product_size = 0;
    719     auto it_all_pot_sizes = inputfile_all_pot_sizes.entries.begin();
    720     for (auto it_default_sizes = inputfile_default_sizes.entries.begin();
    721          it_default_sizes != inputfile_default_sizes.entries.end();
    722          ++it_default_sizes)
    723     {
    724       if (it_default_sizes->product_size == product_size) {
    725         continue;
    726       }
    727       product_size = it_default_sizes->product_size;
    728       while (it_all_pot_sizes != inputfile_all_pot_sizes.entries.end() &&
    729              it_all_pot_sizes->product_size != product_size)
    730       {
    731         ++it_all_pot_sizes;
    732       }
    733       if (it_all_pot_sizes == inputfile_all_pot_sizes.entries.end()) {
    734         break;
    735       }
    736       uint16_t best_pot_block_size = 0;
    737       float best_pot_gflops = 0;
    738       for (auto it = it_all_pot_sizes;
    739            it != inputfile_all_pot_sizes.entries.end() && it->product_size == product_size;
    740            ++it)
    741       {
    742         if (it->gflops > best_pot_gflops) {
    743           best_pot_gflops = it->gflops;
    744           best_pot_block_size = it->pot_block_size;
    745         }
    746       }
    747       results_entry_t entry;
    748       entry.product_size = product_size;
    749       entry.default_block_size = it_default_sizes->nonpot_block_size;
    750       entry.best_pot_block_size = best_pot_block_size;
    751       entry.default_gflops = it_default_sizes->gflops;
    752       entry.best_pot_gflops = best_pot_gflops;
    753       entry.default_efficiency = entry.default_gflops / entry.best_pot_gflops;
    754       results.push_back(entry);
    755 
    756       size_triple_t t(product_size);
    757       if (t.k == t.m && t.m == t.n) {
    758         cubic_results.push_back(entry);
    759       }
    760     }
    761 
    762     cout << "All results:" << endl;
    763     for (auto it = results.begin(); it != results.end(); ++it) {
    764       cout << *it << endl;
    765     }
    766     cout << endl;
    767 
    768     sort(results.begin(), results.end(), lower_efficiency);
    769 
    770     const size_t n = min<size_t>(20, results.size());
    771     cout << n << " worst results:" << endl;
    772     for (size_t i = 0; i < n; i++) {
    773       cout << results[i] << endl;
    774     }
    775     cout << endl;
    776 
    777     cout << "cubic results:" << endl;
    778     for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) {
    779       cout << *it << endl;
    780     }
    781     cout << endl;
    782 
    783     sort(cubic_results.begin(), cubic_results.end(), lower_efficiency);
    784 
    785     cout.precision(2);
    786     vector<float> a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f};
    787     for (auto it = a.begin(); it != a.end(); ++it) {
    788       size_t n = min(results.size() - 1, size_t(*it * results.size()));
    789       cout << (100.0f * n / (results.size() - 1))
    790            << " % of product sizes have default efficiency <= "
    791            << 100.0f * results[n].default_efficiency << " %" << endl;
    792     }
    793     cout.precision(default_precision);
    794   }
    795 };
    796 
    797 
    798 void show_usage_and_exit(int argc, char* argv[],
    799                          const vector<unique_ptr<action_t>>& available_actions)
    800 {
    801   cerr << "usage: " << argv[0] << " <action> [options...] <input files...>" << endl;
    802   cerr << "available actions:" << endl;
    803   for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
    804     cerr << "  " << (*it)->invokation_name() << endl;
    805   }
    806   cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl;
    807   exit(1);
    808 }
    809 
    810 int main(int argc, char* argv[])
    811 {
    812   cout.precision(default_precision);
    813   cerr.precision(default_precision);
    814 
    815   vector<unique_ptr<action_t>> available_actions;
    816   available_actions.emplace_back(new partition_action_t);
    817   available_actions.emplace_back(new evaluate_defaults_action_t);
    818 
    819   vector<string> input_filenames;
    820 
    821   action_t* action = nullptr;
    822 
    823   if (argc < 2) {
    824     show_usage_and_exit(argc, argv, available_actions);
    825   }
    826   for (int i = 1; i < argc; i++) {
    827     bool arg_handled = false;
    828     // Step 1. Try to match action invokation names.
    829     for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
    830       if (!strcmp(argv[i], (*it)->invokation_name())) {
    831         if (!action) {
    832           action = it->get();
    833           arg_handled = true;
    834           break;
    835         } else {
    836           cerr << "can't specify more than one action!" << endl;
    837           show_usage_and_exit(argc, argv, available_actions);
    838         }
    839       }
    840     }
    841     if (arg_handled) {
    842       continue;
    843     }
    844     // Step 2. Try to match option names.
    845     if (argv[i][0] == '-') {
    846       if (!strcmp(argv[i], "--only-cubic-sizes")) {
    847         only_cubic_sizes = true;
    848         arg_handled = true;
    849       }
    850       if (!strcmp(argv[i], "--dump-tables")) {
    851         dump_tables = true;
    852         arg_handled = true;
    853       }
    854       if (!arg_handled) {
    855         cerr << "Unrecognized option: " << argv[i] << endl;
    856         show_usage_and_exit(argc, argv, available_actions);
    857       }
    858     }
    859     if (arg_handled) {
    860       continue;
    861     }
    862     // Step 3. Default to interpreting args as input filenames.
    863     input_filenames.emplace_back(argv[i]);
    864   }
    865 
    866   if (dump_tables && only_cubic_sizes) {
    867     cerr << "Incompatible options: --only-cubic-sizes and --dump-tables." << endl;
    868     show_usage_and_exit(argc, argv, available_actions);
    869   }
    870 
    871   if (!action) {
    872     show_usage_and_exit(argc, argv, available_actions);
    873   }
    874 
    875   action->run(input_filenames);
    876 }
    877