// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <unistd.h>
#ifdef __APPLE__
#include <sys/time.h>
#endif

#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <vector>
#ifdef __APPLE__
#include <TargetConditionals.h>
#endif

#include "test.h"

#ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
#define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
#endif

#if defined(__arm__) && !defined(GEMMLOWP_NEON)
#warning "Building without NEON support on ARM, check your compiler setup!"
#endif

namespace gemmlowp {

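// Returns wall-clock time in seconds: gettimeofday() on Apple platforms,
// clock_gettime(CLOCK_REALTIME) elsewhere.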
double time() {
#ifdef __APPLE__
  timeval t;
  gettimeofday(&t, nullptr);
  return t.tv_sec + 1e-6 * t.tv_usec;
#else
  timespec t;
  clock_gettime(CLOCK_REALTIME, &t);
  return t.tv_sec + 1e-9 * t.tv_nsec;
#endif
}

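// Measurements shorter than min_accurate_duration are retried with more
// iterations; min_working_set_size is the combined amount of matrix data
// cycled through per measurement, presumably chosen to exceed typical CPU
// cache sizes so that timings are not flattered by warm caches.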
const double min_accurate_duration = 1e-1;
const std::size_t min_working_set_size = 16 * 1024 * 1024;

struct gemm_t {
  int rows, depth, cols;
  gemm_t() : rows(0), depth(0), cols(0) {}
  gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}
};

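// Lexicographic ordering on (rows, depth, cols), so that gemm_t can be used
// as a std::map key below.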
bool operator<(const gemm_t& a, const gemm_t& b) {
  return a.rows < b.rows ||
         (a.rows <= b.rows &&
          (a.depth < b.depth || (a.depth <= b.depth && (a.cols < b.cols))));
}

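// Times one pass over the given batch of GEMMs and returns the seconds taken
// per pass, doubling the number of passes per measurement until the measured
// duration reaches min_accurate_duration.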
template <typename LhsType, typename RhsType, typename ResultType>
double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
  typedef std::uint8_t Scalar;

  // set up the matrix pool

  std::size_t combined_gemm_sizes = 0;
  for (auto gemm : gemms) {
    int rows = gemm.rows;
    int depth = gemm.depth;
    int cols = gemm.cols;
    combined_gemm_sizes +=
        sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
  }

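  // Size the matrix pool so that its combined footprint exceeds
  // min_working_set_size; the benchmark loop below rotates through the pool,
  // so successive iterations are not just re-reading cache-hot data.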
  const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;

  std::vector<LhsType> lhs(pool_size * gemms.size());
  std::vector<RhsType> rhs(pool_size * gemms.size());
  std::vector<ResultType> result(pool_size * gemms.size());

  for (std::size_t i = 0; i < pool_size; i++) {
    for (std::size_t j = 0; j < gemms.size(); j++) {
      int k = i * gemms.size() + j;
      lhs[k].Resize(gemms[j].rows, gemms[j].depth);
      MakeConstant(&lhs[k], 0);
      rhs[k].Resize(gemms[j].depth, gemms[j].cols);
      MakeConstant(&rhs[k], 0);
      result[k].Resize(gemms[j].rows, gemms[j].cols);
      MakeConstant(&result[k], 0);
    }
  }

  // main benchmark loop

  int iters_at_a_time = 1;
  float time_per_iter = 0.0f;
  std::size_t pool_index = 0;

  while (true) {
    double starttime = time();
    for (int i = 0; i < iters_at_a_time; i++) {
      for (size_t j = 0; j < gemms.size(); j++) {
        int k = pool_index * gemms.size() + j;
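        // The constants below appear to be the quantization parameters of
        // gemmlowp's legacy Gemm interface (lhs_offset, rhs_offset,
        // result_offset, result_mult_int, result_shift); their exact values
        // are arbitrary here since only throughput is measured.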
        Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
            context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
            -75, -91, 74980, 123, 20);
      }
      pool_index++;
      if (pool_index == pool_size) {
        pool_index = 0;
      }
    }
    double endtime = time();

    const float timing = static_cast<float>(endtime - starttime);

    if (timing >= min_accurate_duration) {
      time_per_iter = timing / iters_at_a_time;
      break;
    }

    iters_at_a_time *= 2;
  }

  return time_per_iter;
}

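// Returns sustained throughput in Gop/s over the given batch of GEMMs,
// counting each multiply-accumulate as two operations
// (2 * rows * depth * cols per GEMM).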
template <typename LhsType, typename RhsType, typename ResultType>
double gflops_for_gemms(GemmContext* context,
                        const std::vector<gemm_t>& gemms) {
  const double time_per_iter =
      time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
  double ops = 0;
  for (auto gemm : gemms) {
    ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
  }
  return 1e-9 * ops / time_per_iter;
}

void benchmark(GemmContext* context) {
  std::map<gemm_t, std::vector<double>> benchmark_results;

  std::vector<gemm_t> benchmark_gemms;
  benchmark_gemms.emplace_back(10, 10, 10);
  benchmark_gemms.emplace_back(20, 20, 20);
  benchmark_gemms.emplace_back(30, 30, 30);
  benchmark_gemms.emplace_back(40, 40, 40);
  benchmark_gemms.emplace_back(50, 50, 50);
  benchmark_gemms.emplace_back(60, 60, 60);
  benchmark_gemms.emplace_back(64, 256, 147);
  benchmark_gemms.emplace_back(100, 100, 1);
  benchmark_gemms.emplace_back(100, 100, 100);
  benchmark_gemms.emplace_back(100, 1000, 100);
  benchmark_gemms.emplace_back(1000, 1000, 1);
  benchmark_gemms.emplace_back(1000, 1000, 10);
  benchmark_gemms.emplace_back(1000, 1000, 100);
  benchmark_gemms.emplace_back(1000, 1000, 1000);

  const int repeat = 2;

  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::RegisterCurrentThreadForProfiling();
  gemmlowp::StartProfiling();
#endif

  // We don't record the first repetition, it's just warm-up.
  for (int r = 0; r < repeat + 1; r++) {
    std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
              << std::flush;
    for (auto gemm : benchmark_gemms) {
      double gflops = 0;
      std::vector<gemm_t> unique_gemm;
      unique_gemm.push_back(gemm);
      gflops =
          gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
      if (r > 0) {
        benchmark_results[gemm].emplace_back(gflops);
      }
    }
  }

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::FinishProfiling();
#endif

  std::cout << "                                                \r"
            << std::flush;

  std::cout.precision(4);

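  // For each GEMM shape, report the best (largest) GFlops/s figure observed
  // across the recorded repetitions; each result vector is sorted ascending,
  // so back() is the maximum.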
  for (auto b : benchmark_results) {
    std::sort(b.second.begin(), b.second.end());
    std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
              << " : " << b.second.back() << " GFlops/s" << std::endl;
  }
  std::cout << std::endl;
}

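// Repeatedly times the given set of GEMMs until at least `mintime` seconds
// have elapsed, then prints latency statistics over the collected timings.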
void benchmark_gemm_sizes(GemmContext* context,
                          const std::vector<gemm_t>& gemms, double mintime) {
  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

  std::vector<float> gemm_times;
  std::cout << "running for " << mintime << " seconds..." << std::endl;

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::RegisterCurrentThreadForProfiling();
  gemmlowp::StartProfiling();
#endif

  double starttime = time();
  while (time() < starttime + mintime) {
    gemm_times.push_back(
        time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
  }

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::FinishProfiling();
#endif

  std::sort(gemm_times.begin(), gemm_times.end());

  double sum_gemm_times = 0;
  double sum_gemm_times_trimmed = 0;
  int count_gemm_times_trimmed = 0;
  const float trim_ratio = 0.25;
  const size_t count_trimmed = gemm_times.size() * trim_ratio;
  double sum_gemm_times_best = 0;
  int count_gemm_times_best = 0;
  const float best_ratio = 0.1;
  const size_t count_best = gemm_times.size() * best_ratio;

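  // Accumulate three statistics over the sorted timings: the overall mean, a
  // trimmed mean that ignores the fastest and slowest 25% of runs, and the
  // mean of the best (fastest) 10% of runs.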
  for (size_t i = 0; i < gemm_times.size(); i++) {
    sum_gemm_times += gemm_times[i];
    if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
      sum_gemm_times_trimmed += gemm_times[i];
      count_gemm_times_trimmed++;
    }
    if (i < count_best) {
      sum_gemm_times_best += gemm_times[i];
      count_gemm_times_best++;
    }
  }

  const double min_latency = gemm_times.front();
  const double max_latency = gemm_times.back();
  const double mean_latency = sum_gemm_times / gemm_times.size();
  const double trimmed_mean_latency =
      sum_gemm_times_trimmed / count_gemm_times_trimmed;
  const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;

  std::cout << "Graph latency (over " << gemm_times.size()
            << " iterations):" << std::endl;
  std::cout << "  Best:             " << min_latency << "s" << std::endl;
  std::cout << "  Worst:            " << max_latency << "s" << std::endl;
  std::cout << "  Mean:             " << mean_latency << "s" << std::endl;
  std::cout << "  " << 100 * trim_ratio
            << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
  std::cout << "  Mean of " << 100 * best_ratio
            << "% best: " << best_mean_latency << "s" << std::endl;
}

void benchmark_googlenet(GemmContext* context) {
  // These are the m, n, k sizes for a typical GoogLeNet.
  const int googlenet_gemm_sizes[] = {
      12544, 64,  147, 3136, 64,   64,   3136, 192,  576,  784, 64,   192,
      784,   96,  192, 784,  128,  864,  784,  16,   192,  784, 32,   400,
      784,   32,  192, 784,  128,  256,  784,  128,  256,  784, 192,  1152,
      784,   32,  256, 784,  96,   800,  784,  64,   256,  196, 192,  480,
      196,   96,  480, 196,  204,  864,  196,  16,   480,  196, 48,   400,
      196,   64,  480, 196,  160,  508,  196,  112,  508,  196, 224,  1008,
      196,   24,  508, 196,  64,   600,  196,  64,   508,  196, 128,  512,
      196,   128, 512, 196,  256,  1152, 196,  24,   512,  196, 64,   600,
      196,   64,  512, 196,  112,  512,  196,  144,  512,  196, 288,  1296,
      196,   32,  512, 196,  64,   800,  196,  64,   512,  196, 256,  528,
      196,   160, 528, 196,  320,  1440, 196,  32,   528,  196, 128,  800,
      196,   128, 528, 49,   256,  832,  49,   160,  832,  49,  320,  1440,
      49,    48,  832, 49,   128,  1200, 49,   128,  832,  49,  384,  832,
      49,    192, 832, 49,   384,  1728, 49,   48,   832,  49,  128,  1200,
      49,    128, 832, 16,   128,  508,  1,    1024, 2048, 1,   1008, 1024,
      16,    128, 528, 1,    1024, 2048, 1,    1008, 1024, 1,   1008, 1024,
  };
  assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
         0);
  const std::size_t num_googlenet_gemms =
      sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));

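  // Each table entry above is an (m, n, k) triple; the GEMMs below are built
  // with rows = n, depth = k and cols = m.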
  std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
  for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
    googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
    googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
    googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
  }

  const double mintime = 20.0;
  benchmark_gemm_sizes(context, googlenet_gemms, mintime);
}

void benchmark_small_model(GemmContext* context) {
  // These are the m, n, k sizes for a small model with large batches.
  const int small_model_gemm_sizes[] = {
      29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
  };
  assert(sizeof(small_model_gemm_sizes) %
             (3 * sizeof(small_model_gemm_sizes[0])) ==
         0);
  const std::size_t num_small_model_gemms =
      sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));

  std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
  for (std::size_t i = 0; i < num_small_model_gemms; i++) {
    small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
    small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
    small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
  }

  const double mintime = 10.0;
  benchmark_gemm_sizes(context, small_model_gemms, mintime);
}

void benchmark_all() {
  {
    gemmlowp::GemmContext context;
    std::cout << "Benchmarking small model GEMMs..." << std::endl;
    gemmlowp::benchmark_small_model(&context);
  }

  {
    gemmlowp::GemmContext context;
    std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
    gemmlowp::benchmark_googlenet(&context);
  }

  {
    gemmlowp::GemmContext context;
    std::cout << "Benchmarking default mode (typically multi-threaded)..."
              << std::endl;
    gemmlowp::benchmark(&context);
  }

  {
    gemmlowp::GemmContext context;
    context.set_max_num_threads(1);
    std::cout << "Benchmarking single-threaded mode..." << std::endl;
    gemmlowp::benchmark(&context);
  }
}

}  // end namespace gemmlowp

// For iOS, we need to define our own main(), so skip it here.
#if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
int main() { gemmlowp::benchmark_all(); }
#endif