// Example command line to build on Android ARM64:
/*
~/android/toolchains/r15c-aarch64/bin/aarch64-linux-android-clang++ \
test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -fPIE -static \
-DBENCHMARK_QUICK -DBENCHMARK_8bit
*/

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <iostream>
#include <map>
#include <random>
#include <set>
#include <tuple>
#include <vector>

// For sleep() / Sleep(), used for the optional cooldown between benchmarks.
#ifdef _WIN32
#include <windows.h>
#else
#include <unistd.h>
#endif

#include "../public/gemmlowp.h"

#if defined GEMMLOWP_ANDROID && defined GEMMLOWP_ARM_32
// Compilation workaround
namespace std {
  using ::round;
}
#endif

// Minimum duration of each benchmark measurement.
const double kBenchmarkSecs = 0.1;

// Sleep time before each benchmark measurement, to keep over-heating from
// one measurement from affecting the next (0 disables the cooldown).
const int kCooldownBeforeBenchmarkSecs = 0;

// Number of benchmark passes.
const int kPasses = 4;

#ifdef BENCHMARK_NUM_THREADS
const int kNumThreads = BENCHMARK_NUM_THREADS;
#else
const int kNumThreads = 1;
#endif

namespace gemmlowp {

// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class,
// since it only maps existing data. In tests though, we need to
// create our own matrices.
template <typename tScalar, MapOrder tOrder>
class Matrix : public MatrixMap<tScalar, tOrder> {
 public:
  typedef MatrixMap<tScalar, tOrder> Map;
  typedef MatrixMap<const tScalar, tOrder> ConstMap;
  typedef typename Map::Scalar Scalar;
  static const MapOrder Order = tOrder;
  using Map::cols_;
  using Map::data_;
  using Map::kOrder;
  using Map::rows_;
  using Map::stride_;

 public:
  Matrix() : Map(nullptr, 0, 0, 0) {}

  Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); }

  Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; }

  Matrix& operator=(const Matrix& other) {
    Resize(other.rows_, other.cols_);
    std::memcpy(data_, other.data_, size() * sizeof(Scalar));
    return *this;
  }

  friend bool operator==(const Matrix& a, const Matrix& b) {
    // Compare size() elements, i.e. size() * sizeof(Scalar) bytes.
    return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
           !std::memcmp(a.data_, b.data_, a.size() * sizeof(Scalar));
  }

  void Resize(int rows, int cols) {
    rows_ = rows;
    cols_ = cols;
    stride_ = kOrder == MapOrder::ColMajor ? rows : cols;
    storage.resize(size());
    data_ = storage.data();
  }

  int size() const { return rows_ * cols_; }

  Map& map() { return *static_cast<Map*>(this); }

  ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); }

 protected:
  std::vector<Scalar> storage;
};
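
// Illustrative usage (a sketch, not exercised by the benchmark itself):
// unlike a bare MatrixMap, a Matrix owns its storage, so it can be built
// from dimensions alone:
//
//   Matrix<std::uint8_t, MapOrder::RowMajor> m(2, 3);  // owns 2*3 bytes
//   m(1, 2) = 42;  // element access inherited from MatrixMap
//
// while a plain MatrixMap only wraps a caller-owned buffer:
//
//   std::uint8_t buf[6];
//   MatrixMap<std::uint8_t, MapOrder::RowMajor> view(buf, 2, 3, 3);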

// Fills a matrix with the value 128. For the uint8 operand matrices, 128
// represents the real value 0, given the lhs/rhs offsets of -128 passed to
// GemmWithOutputPipeline below.
template <typename MatrixType>
void MakeZero(MatrixType* m) {
  for (int c = 0; c < m->cols(); c++) {
    for (int r = 0; r < m->rows(); r++) {
      (*m)(r, c) = 128;
    }
  }
}

}  // end namespace gemmlowp

template <typename BitDepthParams>
float benchmark_8bit(int rows, int depth, int cols) {
  using namespace gemmlowp;
  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

  LhsType lhs;
  RhsType rhs;
  ResultType result;
  lhs.Resize(rows, depth);
  rhs.Resize(depth, cols);
  result.Resize(rows, cols);
  MakeZero(&lhs);
  MakeZero(&rhs);
  MakeZero(&result);

  typedef std::tuple<OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
                     OutputStageSaturatingCastToUint8>
      Pipeline;
  gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
      quantize_down_stage;
  quantize_down_stage.result_offset_after_shift = 128;
  quantize_down_stage.result_fixedpoint_multiplier = 1234567890;
  quantize_down_stage.result_shift = 16;
  gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
  const Pipeline output_pipeline =
      std::make_tuple(quantize_down_stage, saturating_cast_stage);
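  // What this output pipeline computes, per int32 accumulator x (a sketch of
  // the semantics; see gemmlowp's output stage documentation):
  //   uint8_saturate(round(x * result_fixedpoint_multiplier / 2^31
  //                        / 2^result_shift) + result_offset_after_shift)
  // The particular constants are arbitrary: only throughput is measured.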
  GemmContext gemm_context;
  gemm_context.set_max_num_threads(kNumThreads);
  // Warm-up run, not included in the timed measurement below.
  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
      -128, output_pipeline);

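  // Timed measurement: run GEMMs in batches, doubling the batch size each
  // time, so the clock is only queried O(log(iters)) times and timer overhead
  // stays negligible even for fast small-shape GEMMs. The same scheme is used
  // in benchmark_8bit_to_32bit below.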
  double time_start = real_time_in_seconds();
  double t = time_start;
  int iters = 0;
  int iters_at_a_time = 1;
  while (t - time_start < kBenchmarkSecs) {
    for (int i = 0; i < iters_at_a_time; i++) {
      gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
                                       BitDepthParams>(
          &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
          -128, output_pipeline);
      iters++;
    }
    iters_at_a_time *= 2;
    t = real_time_in_seconds();
  }
  return (t - time_start) / iters;
}

template <typename BitDepthParams>
float benchmark_8bit_to_32bit(int rows, int depth, int cols) {
  using namespace gemmlowp;
  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
  typedef Matrix<std::int32_t, MapOrder::ColMajor> ResultType;

  LhsType lhs;
  RhsType rhs;
  ResultType result;
  lhs.Resize(rows, depth);
  rhs.Resize(depth, cols);
  result.Resize(rows, cols);
  MakeZero(&lhs);
  MakeZero(&rhs);
  MakeZero(&result);

  // An empty output pipeline returns the raw int32 accumulators directly.
  typedef std::tuple<> EmptyPipeline;
  GemmContext gemm_context;
  gemm_context.set_max_num_threads(kNumThreads);
  // Warm-up run, not included in the timed measurement below.
  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
      -128, EmptyPipeline());

  // Same batched timing scheme as in benchmark_8bit above.
  double time_start = real_time_in_seconds();
  double t = time_start;
  int iters = 0;
  int iters_at_a_time = 1;
  while (t - time_start < kBenchmarkSecs) {
    for (int i = 0; i < iters_at_a_time; i++) {
      gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
                                       BitDepthParams>(
          &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
          -128, EmptyPipeline());
      iters++;
    }
    iters_at_a_time *= 2;
    t = real_time_in_seconds();
  }
  return (t - time_start) / iters;
}

struct Shape {
  int rows;
  int depth;
  int cols;
};

bool operator==(const Shape& s1, const Shape& s2) {
  return s1.rows == s2.rows && s1.depth == s2.depth && s1.cols == s2.cols;
}

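// Strict weak ordering (by depth, then rows, then cols), required for Shape
// to serve as the key type of std::map<Shape, float>.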
bool operator<(const Shape& shape1, const Shape& shape2) {
  return shape1.depth < shape2.depth ||
         (shape1.depth == shape2.depth &&
          (shape1.rows < shape2.rows ||
           (shape1.rows == shape2.rows && shape1.cols < shape2.cols)));
}

#ifdef _WIN32
// Windows's Sleep takes milliseconds, while POSIX sleep takes seconds.
#define sleep(t) Sleep(1000 * (t))
#endif

float benchmark(const Shape& shape) {
  if (kCooldownBeforeBenchmarkSecs) {
    sleep(kCooldownBeforeBenchmarkSecs);
  }
#if defined BENCHMARK_8bit
  // Benchmark the fast 8bit path, using L8R8WithLhsNonzeroBitDepthParams.
  // This is the recommended default: it is what most applications want to
  // use, as it is the fastest.
  // The contract is that LHS must take values in [1, 255], while RHS can take
  // any value in [0, 255].
  return benchmark_8bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
      shape.rows, shape.depth, shape.cols);
#elif defined BENCHMARK_8bit_wide
  // Variant benchmarking the slower (mostly legacy) DefaultL8R8BitDepthParams.
  // The only contract difference is that both LHS and RHS can take values in
  // [0, 255].
  return benchmark_8bit<gemmlowp::DefaultL8R8BitDepthParams>(
      shape.rows, shape.depth, shape.cols);
#elif defined BENCHMARK_8bit_to_32bit
  // Variant of BENCHMARK_8bit where the user asks for raw int32 accumulators
  // instead of an 8bit-downscaled result.
  return benchmark_8bit_to_32bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
      shape.rows, shape.depth, shape.cols);
#elif defined BENCHMARK_8bit_to_32bit_wide
  // Variant of BENCHMARK_8bit_wide where the user asks for raw int32
  // accumulators instead of an 8bit-downscaled result.
  return benchmark_8bit_to_32bit<gemmlowp::DefaultL8R8BitDepthParams>(
      shape.rows, shape.depth, shape.cols);
#elif defined BENCHMARK_float
  // Note: benchmark_float is not defined in this file; this path requires
  // providing it separately.
  return benchmark_float(shape.rows, shape.depth, shape.cols);
#else
#error What arithmetic path should we benchmark? (Suggestion: #define BENCHMARK_8bit)
#endif
}

// Returns the set of matrix sizes to benchmark: all powers of two from 1 to
// 2048, densified with half-octave (factor sqrt(2)) steps from 8 to 2048 and
// quarter-octave (factor 2^(1/4)) steps from 16 to 512.
std::set<int> all_sizes() {
  std::set<int> sizes;
  for (int i = 1; i <= 2048; i *= 2) {
    sizes.insert(i);
  }
  for (double x = 8; x <= 2048; x *= std::sqrt(2.)) {
    sizes.insert(static_cast<int>(std::round(x)));
  }
  for (double x = 16; x <= 512; x *= std::pow(2., 1. / 4.)) {
    sizes.insert(static_cast<int>(std::round(x)));
  }
  return sizes;
}

std::mt19937& RandomEngine() {
  // Default-constructed, hence deterministically seeded: the shuffled
  // benchmark order is reproducible from run to run.
  static std::mt19937 engine;
  return engine;
}

std::vector<Shape> all_shapes_in_random_order() {
  std::vector<Shape> shapes;
  const std::set<int> sizes = all_sizes();
#if defined BENCHMARK_ROWS
  // Benchmark one specific shape
  Shape shape;
  shape.rows = BENCHMARK_ROWS;
  shape.depth = BENCHMARK_DEPTH;
  shape.cols = BENCHMARK_COLS;
  shapes.push_back(shape);
#elif defined BENCHMARK_QUICK
  // Benchmark an assortment of cubic shapes
  for (int size : sizes) {
    Shape shape;
    shape.rows = size;
    shape.depth = size;
    shape.cols = size;
    shapes.push_back(shape);
  }
#elif defined BENCHMARK_EXHAUSTIVE
  // Benchmark all sorts of shapes
  for (int rows : sizes) {
    for (int depth : sizes) {
      for (int cols : sizes) {
        Shape shape;
        shape.rows = rows;
        shape.depth = depth;
        shape.cols = cols;
        shapes.push_back(shape);
      }
    }
  }
#else
#error What shapes should we benchmark? (Suggestion: #define BENCHMARK_QUICK)
#endif
  std::shuffle(std::begin(shapes), std::end(shapes), RandomEngine());
  return shapes;
}

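// Measures every shape kPasses times, in a freshly shuffled order on each
// pass, and keeps the minimum latency observed for each shape. Randomizing
// the order helps decorrelate a shape's measurements from slow drift of the
// device's state (e.g. temperature) over the run.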
void run_benchmarks(std::map<Shape, float>* results) {
  std::vector<Shape> shapes;
  for (int pass = 0; pass < kPasses; pass++) {
    const std::vector<Shape> pass_shapes = all_shapes_in_random_order();
    shapes.insert(std::end(shapes), std::begin(pass_shapes),
                  std::end(pass_shapes));
  }

  const double time_start = gemmlowp::real_time_in_seconds();
  for (std::size_t i = 0; i < shapes.size(); i++) {
    const double ratio = static_cast<double>(i) / shapes.size();
    const double elapsed = gemmlowp::real_time_in_seconds() - time_start;
    const double elapsed_hours = elapsed / 3600.;
    // Linear extrapolation of the remaining time (guard the first iteration,
    // where ratio == 0).
    const double eta_hours =
        ratio > 0 ? elapsed_hours * (1. - ratio) / ratio : 0.;
    fprintf(stderr,
            "Benchmarking: %.2f%% done, Elapsed: %.2f hours, ETA: %.2f "
            "hours...   \r",
            100. * ratio, elapsed_hours, eta_hours);
    fflush(stderr);
    const Shape& shape = shapes[i];
    float latency = benchmark(shape);
    // Keep the fastest of the kPasses measurements of this shape.
    if (results->count(shape)) {
      (*results)[shape] = std::min(latency, (*results)[shape]);
    } else {
      (*results)[shape] = latency;
    }
  }
  fprintf(stderr, "\n");
}

int main() {
  std::map<Shape, float> results;
  run_benchmarks(&results);
  printf("Using %d thread(s)\n", kNumThreads);
  printf("depth,rows,cols,latency(s),Gop/s\n");
  for (const auto& result : results) {
    const Shape& shape = result.first;
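    // A rows x depth x cols GEMM performs rows*cols*depth multiply-adds,
    // i.e. 2*rows*cols*depth arithmetic ops; the 2e-9 factor converts
    // ops per second into Gop/s.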
    printf("%d,%d,%d,%.4g,%.4g\n", shape.depth, shape.rows, shape.cols,
           result.second,
           2e-9 * shape.depth * shape.rows * shape.cols / result.second);
  }
}