Home | History | Annotate | Download | only in test
      1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #ifdef __APPLE__
     16 #include <sys/time.h>
     17 #endif
     18 
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <vector>
     25 #ifdef __APPLE__
     26 #include <TargetConditionals.h>
     27 #endif
     28 
     29 #include "test.h"
     30 
     31 #ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
     32 #define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
     33 #endif
     34 
     35 #if defined(__arm__) && !defined(GEMMLOWP_NEON)
     36 #warning "Building without NEON support on ARM, check your compiler setup!"
     37 #endif
     38 
     39 #if defined(__SSE4_2__) && !defined(GEMMLOWP_SSE4)
     40 #warning \
     41     "Building without SSE4.2 support on SSE4.2 enabled machine, check your compiler setup!"
     42 #endif
     43 
     44 namespace gemmlowp {
     45 
     46 const double min_accurate_duration = 1e-1;
     47 const std::size_t min_working_set_size = 16 * 1024 * 1024;
     48 
     49 struct gemm_t {
     50   int rows, depth, cols;
     51   gemm_t() : rows(0), depth(0), cols(0) {}
     52   gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}
     53 };
     54 
     55 bool operator<(const gemm_t& a, const gemm_t& b) {
     56   return a.rows < b.rows ||
     57          (a.rows <= b.rows &&
     58           (a.depth < b.depth || (a.depth <= b.depth && (a.cols < b.cols))));
     59 }
     60 
     61 template <typename LhsType, typename RhsType, typename ResultType>
     62 double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
     63   typedef std::uint8_t Scalar;
     64 
     65   // set up the matrix pool
     66 
     67   std::size_t combined_gemm_sizes = 0;
     68   for (auto gemm : gemms) {
     69     int rows = gemm.rows;
     70     int depth = gemm.depth;
     71     int cols = gemm.cols;
     72     combined_gemm_sizes +=
     73         sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
     74   }
     75 
     76   const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;
     77 
     78   std::vector<LhsType> lhs(pool_size * gemms.size());
     79   std::vector<RhsType> rhs(pool_size * gemms.size());
     80   std::vector<ResultType> result(pool_size * gemms.size());
     81 
     82   for (std::size_t i = 0; i < pool_size; i++) {
     83     for (std::size_t j = 0; j < gemms.size(); j++) {
     84       int k = i * gemms.size() + j;
     85       lhs[k].Resize(gemms[j].rows, gemms[j].depth);
     86       MakeConstant(&lhs[k], 0);
     87       rhs[k].Resize(gemms[j].depth, gemms[j].cols);
     88       MakeConstant(&rhs[k], 0);
     89       result[k].Resize(gemms[j].rows, gemms[j].cols);
     90       MakeConstant(&result[k], 0);
     91     }
     92   }
     93 
     94   // main benchmark loop
     95 
     96   int iters_at_a_time = 1;
     97   float time_per_iter = 0.0f;
     98   std::size_t pool_index = 0;
     99 
    100   while (true) {
    101     double starttime = real_time_in_seconds();
    102     for (int i = 0; i < iters_at_a_time; i++) {
    103       for (size_t j = 0; j < gemms.size(); j++) {
    104         size_t k = pool_index * gemms.size() + j;
    105         Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
    106             context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
    107             -75, -91, 74980, 123, 20);
    108       }
    109       pool_index++;
    110       if (pool_index == pool_size) {
    111         pool_index = 0;
    112       }
    113     }
    114     double endtime = real_time_in_seconds();
    115 
    116     const float timing = static_cast<float>(endtime - starttime);
    117 
    118     if (timing >= min_accurate_duration) {
    119       time_per_iter = timing / iters_at_a_time;
    120       break;
    121     }
    122 
    123     iters_at_a_time *= 2;
    124   }
    125 
    126   return time_per_iter;
    127 }
    128 
    129 template <typename LhsType, typename RhsType, typename ResultType>
    130 double gflops_for_gemms(GemmContext* context,
    131                         const std::vector<gemm_t>& gemms) {
    132   const double time_per_iter =
    133       time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
    134   double ops = 0;
    135   for (auto gemm : gemms) {
    136     ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
    137   }
    138   return 1e-9 * ops / time_per_iter;
    139 }
    140 
    141 void benchmark(GemmContext* context) {
    142   std::map<gemm_t, std::vector<double>> benchmark_results;
    143 
    144   std::vector<gemm_t> benchmark_gemms;
    145   benchmark_gemms.emplace_back(10, 10, 10);
    146   benchmark_gemms.emplace_back(20, 20, 20);
    147   benchmark_gemms.emplace_back(30, 30, 30);
    148   benchmark_gemms.emplace_back(40, 40, 40);
    149   benchmark_gemms.emplace_back(50, 50, 50);
    150   benchmark_gemms.emplace_back(60, 60, 60);
    151   benchmark_gemms.emplace_back(64, 256, 147);
    152   benchmark_gemms.emplace_back(100, 100, 1);
    153   benchmark_gemms.emplace_back(100, 100, 100);
    154   benchmark_gemms.emplace_back(100, 1000, 100);
    155   benchmark_gemms.emplace_back(1000, 1000, 1);
    156   benchmark_gemms.emplace_back(1000, 1000, 10);
    157   benchmark_gemms.emplace_back(1000, 1000, 100);
    158   benchmark_gemms.emplace_back(1000, 1000, 1000);
    159 
    160   const int repeat = 2;
    161 
    162   typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
    163   typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
    164   typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
    165 
    166 #ifdef GEMMLOWP_TEST_PROFILE
    167   gemmlowp::RegisterCurrentThreadForProfiling();
    168   gemmlowp::StartProfiling();
    169 #endif
    170 
    171   // We don't record the first repetition, it's just warm-up.
    172   for (int r = 0; r < repeat + 1; r++) {
    173     std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
    174               << std::flush;
    175     for (auto gemm : benchmark_gemms) {
    176       double gflops = 0;
    177       std::vector<gemm_t> unique_gemm;
    178       unique_gemm.push_back(gemm);
    179       gflops =
    180           gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
    181       if (r > 0) {
    182         benchmark_results[gemm].emplace_back(gflops);
    183       }
    184     }
    185   }
    186 
    187 #ifdef GEMMLOWP_TEST_PROFILE
    188   gemmlowp::FinishProfiling();
    189 #endif
    190 
    191   std::cout << "                                                \r"
    192             << std::flush;
    193 
    194   std::cout.precision(4);
    195 
    196   for (auto b : benchmark_results) {
    197     sort(b.second.begin(), b.second.end());
    198     std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
    199               << " : " << b.second.back() << " GFlops/s" << std::endl;
    200   }
    201   std::cout << std::endl;
    202 }
    203 
    204 void benchmark_gemm_sizes(GemmContext* context,
    205                           const std::vector<gemm_t>& gemms, double mintime) {
    206   typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
    207   typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
    208   typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
    209 
    210   std::vector<float> gemm_times;
    211   std::cout << "running for " << mintime << " seconds..." << std::endl;
    212 
    213 #ifdef GEMMLOWP_TEST_PROFILE
    214   gemmlowp::RegisterCurrentThreadForProfiling();
    215   gemmlowp::StartProfiling();
    216 #endif
    217 
    218   double starttime = real_time_in_seconds();
    219   while (real_time_in_seconds() < starttime + mintime) {
    220     gemm_times.push_back(
    221         time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
    222   }
    223 
    224 #ifdef GEMMLOWP_TEST_PROFILE
    225   gemmlowp::FinishProfiling();
    226 #endif
    227 
    228   std::sort(gemm_times.begin(), gemm_times.end());
    229 
    230   double sum_gemm_times = 0;
    231   double sum_gemm_times_trimmed = 0;
    232   int count_gemm_times_trimmed = 0;
    233   const float trim_ratio = 0.25;
    234   const size_t count_trimmed = gemm_times.size() * trim_ratio;
    235   double sum_gemm_times_best = 0;
    236   int count_gemm_times_best = 0;
    237   const float best_ratio = 0.1;
    238   const size_t count_best = gemm_times.size() * best_ratio;
    239 
    240   for (size_t i = 0; i < gemm_times.size(); i++) {
    241     sum_gemm_times += gemm_times[i];
    242     if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
    243       sum_gemm_times_trimmed += gemm_times[i];
    244       count_gemm_times_trimmed++;
    245     }
    246     if (i < count_best) {
    247       sum_gemm_times_best += gemm_times[i];
    248       count_gemm_times_best++;
    249     }
    250   }
    251 
    252   const double min_latency = gemm_times.front();
    253   const double max_latency = gemm_times.back();
    254   const double mean_latency = sum_gemm_times / gemm_times.size();
    255   const double trimmed_mean_latency =
    256       sum_gemm_times_trimmed / count_gemm_times_trimmed;
    257   const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;
    258 
    259   std::cout << "Graph latency (over " << gemm_times.size()
    260             << " iterations):" << std::endl;
    261   std::cout << "  Best:             " << min_latency << "s" << std::endl;
    262   std::cout << "  Worst:            " << max_latency << "s" << std::endl;
    263   std::cout << "  Mean:             " << mean_latency << "s" << std::endl;
    264   std::cout << "  " << 100 * trim_ratio
    265             << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
    266   std::cout << "  Mean of " << 100 * best_ratio
    267             << "% best: " << best_mean_latency << "s" << std::endl;
    268 }
    269 
    270 void benchmark_googlenet(GemmContext* context) {
    271   // These are the m, n, k sizes for a typical GoogLeNet.
    272   const int googlenet_gemm_sizes[] = {
    273       12544, 64,  147, 3136, 64,   64,   3136, 192,  576,  784, 64,   192,
    274       784,   96,  192, 784,  128,  864,  784,  16,   192,  784, 32,   400,
    275       784,   32,  192, 784,  128,  256,  784,  128,  256,  784, 192,  1152,
    276       784,   32,  256, 784,  96,   800,  784,  64,   256,  196, 192,  480,
    277       196,   96,  480, 196,  204,  864,  196,  16,   480,  196, 48,   400,
    278       196,   64,  480, 196,  160,  508,  196,  112,  508,  196, 224,  1008,
    279       196,   24,  508, 196,  64,   600,  196,  64,   508,  196, 128,  512,
    280       196,   128, 512, 196,  256,  1152, 196,  24,   512,  196, 64,   600,
    281       196,   64,  512, 196,  112,  512,  196,  144,  512,  196, 288,  1296,
    282       196,   32,  512, 196,  64,   800,  196,  64,   512,  196, 256,  528,
    283       196,   160, 528, 196,  320,  1440, 196,  32,   528,  196, 128,  800,
    284       196,   128, 528, 49,   256,  832,  49,   160,  832,  49,  320,  1440,
    285       49,    48,  832, 49,   128,  1200, 49,   128,  832,  49,  384,  832,
    286       49,    192, 832, 49,   384,  1728, 49,   48,   832,  49,  128,  1200,
    287       49,    128, 832, 16,   128,  508,  1,    1024, 2048, 1,   1008, 1024,
    288       16,    128, 528, 1,    1024, 2048, 1,    1008, 1024, 1,   1008, 1024,
    289   };
    290   assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
    291          0);
    292   const std::size_t num_googlenet_gemms =
    293       sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));
    294 
    295   std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
    296   for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
    297     googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
    298     googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
    299     googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
    300   }
    301 
    302   const double mintime = 20.0;
    303   benchmark_gemm_sizes(context, googlenet_gemms, mintime);
    304 }
    305 
    306 void benchmark_small_model(GemmContext* context) {
    307   // These are the m, n, k sizes for a small model with large batches.
    308   const int small_model_gemm_sizes[] = {
    309       29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
    310   };
    311   assert(sizeof(small_model_gemm_sizes) %
    312              (3 * sizeof(small_model_gemm_sizes[0])) ==
    313          0);
    314   const std::size_t num_small_model_gemms =
    315       sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));
    316 
    317   std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
    318   for (std::size_t i = 0; i < num_small_model_gemms; i++) {
    319     small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
    320     small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
    321     small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
    322   }
    323 
    324   const double mintime = 10.0;
    325   benchmark_gemm_sizes(context, small_model_gemms, mintime);
    326 }
    327 
    328 void benchmark_all() {
    329   {
    330     gemmlowp::GemmContext context;
    331     std::cout << "Benchmarking small model GEMMs..." << std::endl;
    332     gemmlowp::benchmark_small_model(&context);
    333   }
    334 
    335   {
    336     gemmlowp::GemmContext context;
    337     std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
    338     gemmlowp::benchmark_googlenet(&context);
    339   }
    340 
    341   {
    342     gemmlowp::GemmContext context;
    343     context.set_max_num_threads(0);
    344     std::cout << "Benchmarking multi-threaded mode..." << std::endl;
    345     gemmlowp::benchmark(&context);
    346   }
    347 
    348   {
    349     gemmlowp::GemmContext context;
    350     context.set_max_num_threads(1);
    351     std::cout << "Benchmarking single-threaded mode..." << std::endl;
    352     gemmlowp::benchmark(&context);
    353   }
    354 }
    355 
    356 }  // end namespace gemmlowp
    357 
    358 // For iOS, we need to define our own main(), so skip it here.
    359 #if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
    360 int main() { gemmlowp::benchmark_all(); }
    361 #endif
    362