// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <unistd.h>
#ifdef __APPLE__
#include <sys/time.h>
#endif

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <vector>
#ifdef __APPLE__
#include <TargetConditionals.h>
#endif

#include "test.h"

#ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
#define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
#endif

#if defined(__arm__) && !defined(GEMMLOWP_NEON)
#warning "Building without NEON support on ARM, check your compiler setup!"
#endif

namespace gemmlowp {

double time() {
#ifdef __APPLE__
  timeval t;
  gettimeofday(&t, nullptr);
  return t.tv_sec + 1e-6 * t.tv_usec;
#else
  timespec t;
  clock_gettime(CLOCK_REALTIME, &t);
  return t.tv_sec + 1e-9 * t.tv_nsec;
#endif
}

const double min_accurate_duration = 1e-1;
const std::size_t min_working_set_size = 16 * 1024 * 1024;

struct gemm_t {
  int rows, depth, cols;
  gemm_t() : rows(0), depth(0), cols(0) {}
  gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}
};

bool operator<(const gemm_t& a, const gemm_t& b) {
  return a.rows < b.rows ||
         (a.rows <= b.rows &&
          (a.depth < b.depth || (a.depth <= b.depth && (a.cols < b.cols))));
}

template <typename LhsType, typename RhsType, typename ResultType>
double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
  typedef std::uint8_t Scalar;

  // set up the matrix pool

  std::size_t combined_gemm_sizes = 0;
  for (auto gemm : gemms) {
    int rows = gemm.rows;
    int depth = gemm.depth;
    int cols = gemm.cols;
    combined_gemm_sizes +=
        sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
  }

  const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;

  std::vector<LhsType> lhs(pool_size * gemms.size());
  std::vector<RhsType> rhs(pool_size * gemms.size());
  std::vector<ResultType> result(pool_size * gemms.size());

  for (std::size_t i = 0; i < pool_size; i++) {
    for (std::size_t j = 0; j < gemms.size(); j++) {
      int k = i * gemms.size() + j;
      lhs[k].Resize(gemms[j].rows, gemms[j].depth);
      MakeConstant(&lhs[k], 0);
      rhs[k].Resize(gemms[j].depth, gemms[j].cols);
      MakeConstant(&rhs[k], 0);
      result[k].Resize(gemms[j].rows, gemms[j].cols);
      MakeConstant(&result[k], 0);
    }
  }

  // main benchmark loop

  int iters_at_a_time = 1;
  float time_per_iter = 0.0f;
  std::size_t pool_index = 0;

  while (true) {
    double starttime = time();
    for (int i = 0; i < iters_at_a_time; i++) {
      for (size_t j = 0; j < gemms.size(); j++) {
        int k = pool_index * gemms.size() + j;
        // The trailing arguments are arbitrary quantization parameters:
        // lhs_offset, rhs_offset, result_offset, result_mult_int,
        // result_shift.
        Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
            context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
            -75, -91, 74980, 123, 20);
      }
      pool_index++;
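      // Cycle through the matrix pool; the pool is sized (via
      // min_working_set_size) so that the matrices do not all stay
      // cache-resident across iterations.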
      if (pool_index == pool_size) {
        pool_index = 0;
      }
    }
    double endtime = time();

    const float timing = static_cast<float>(endtime - starttime);

    if (timing >= min_accurate_duration) {
      time_per_iter = timing / iters_at_a_time;
      break;
    }

    iters_at_a_time *= 2;
  }

  return time_per_iter;
}

template <typename LhsType, typename RhsType, typename ResultType>
double gflops_for_gemms(GemmContext* context,
                        const std::vector<gemm_t>& gemms) {
  const double time_per_iter =
      time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
  double ops = 0;
  for (auto gemm : gemms) {
    ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
  }
  return 1e-9 * ops / time_per_iter;
}

void benchmark(GemmContext* context) {
  std::map<gemm_t, std::vector<double>> benchmark_results;

  std::vector<gemm_t> benchmark_gemms;
  benchmark_gemms.emplace_back(10, 10, 10);
  benchmark_gemms.emplace_back(20, 20, 20);
  benchmark_gemms.emplace_back(30, 30, 30);
  benchmark_gemms.emplace_back(40, 40, 40);
  benchmark_gemms.emplace_back(50, 50, 50);
  benchmark_gemms.emplace_back(60, 60, 60);
  benchmark_gemms.emplace_back(64, 256, 147);
  benchmark_gemms.emplace_back(100, 100, 1);
  benchmark_gemms.emplace_back(100, 100, 100);
  benchmark_gemms.emplace_back(100, 1000, 100);
  benchmark_gemms.emplace_back(1000, 1000, 1);
  benchmark_gemms.emplace_back(1000, 1000, 10);
  benchmark_gemms.emplace_back(1000, 1000, 100);
  benchmark_gemms.emplace_back(1000, 1000, 1000);

  const int repeat = 2;

  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::RegisterCurrentThreadForProfiling();
  gemmlowp::StartProfiling();
#endif

  // We don't record the first repetition, it's just warm-up.
  for (int r = 0; r < repeat + 1; r++) {
    std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
              << std::flush;
    for (auto gemm : benchmark_gemms) {
      double gflops = 0;
      std::vector<gemm_t> unique_gemm;
      unique_gemm.push_back(gemm);
      gflops =
          gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
      if (r > 0) {
        benchmark_results[gemm].emplace_back(gflops);
      }
    }
  }

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::FinishProfiling();
#endif

  // Clear the progress line.
  std::cout << "                                            \r" << std::flush;

  std::cout.precision(4);

  for (auto b : benchmark_results) {
    std::sort(b.second.begin(), b.second.end());
    std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
              << " : " << b.second.back() << " GFlops/s" << std::endl;
  }
  std::cout << std::endl;
}

void benchmark_gemm_sizes(GemmContext* context,
                          const std::vector<gemm_t>& gemms, double mintime) {
  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

  std::vector<float> gemm_times;
  std::cout << "running for " << mintime << " seconds..." << std::endl;
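  // Time the whole set of GEMMs repeatedly until mintime seconds have
  // elapsed, then report latency statistics over the collected timings.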

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::RegisterCurrentThreadForProfiling();
  gemmlowp::StartProfiling();
#endif

  double starttime = time();
  while (time() < starttime + mintime) {
    gemm_times.push_back(
        time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
  }

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::FinishProfiling();
#endif

  std::sort(gemm_times.begin(), gemm_times.end());

  double sum_gemm_times = 0;
  double sum_gemm_times_trimmed = 0;
  int count_gemm_times_trimmed = 0;
  const float trim_ratio = 0.25;
  const size_t count_trimmed = gemm_times.size() * trim_ratio;
  double sum_gemm_times_best = 0;
  int count_gemm_times_best = 0;
  const float best_ratio = 0.1;
  const size_t count_best = gemm_times.size() * best_ratio;

  for (size_t i = 0; i < gemm_times.size(); i++) {
    sum_gemm_times += gemm_times[i];
    if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
      sum_gemm_times_trimmed += gemm_times[i];
      count_gemm_times_trimmed++;
    }
    if (i < count_best) {
      sum_gemm_times_best += gemm_times[i];
      count_gemm_times_best++;
    }
  }

  const double min_latency = gemm_times.front();
  const double max_latency = gemm_times.back();
  const double mean_latency = sum_gemm_times / gemm_times.size();
  const double trimmed_mean_latency =
      sum_gemm_times_trimmed / count_gemm_times_trimmed;
  const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;

  std::cout << "Graph latency (over " << gemm_times.size()
            << " iterations):" << std::endl;
  std::cout << "  Best:  " << min_latency << "s" << std::endl;
  std::cout << "  Worst: " << max_latency << "s" << std::endl;
  std::cout << "  Mean:  " << mean_latency << "s" << std::endl;
  std::cout << "  " << 100 * trim_ratio
            << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
  std::cout << "  Mean of " << 100 * best_ratio
            << "% best: " << best_mean_latency << "s" << std::endl;
}

void benchmark_googlenet(GemmContext* context) {
  // These are the m, n, k sizes for a typical GoogLeNet.
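  // Each consecutive triple below is one GEMM (m, n, k); the loop after the
  // array maps each triple to cols = m, rows = n, depth = k.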
  const int googlenet_gemm_sizes[] = {
      12544, 64, 147, 3136, 64, 64, 3136, 192, 576, 784, 64, 192,
      784, 96, 192, 784, 128, 864, 784, 16, 192, 784, 32, 400,
      784, 32, 192, 784, 128, 256, 784, 128, 256, 784, 192, 1152,
      784, 32, 256, 784, 96, 800, 784, 64, 256, 196, 192, 480,
      196, 96, 480, 196, 204, 864, 196, 16, 480, 196, 48, 400,
      196, 64, 480, 196, 160, 508, 196, 112, 508, 196, 224, 1008,
      196, 24, 508, 196, 64, 600, 196, 64, 508, 196, 128, 512,
      196, 128, 512, 196, 256, 1152, 196, 24, 512, 196, 64, 600,
      196, 64, 512, 196, 112, 512, 196, 144, 512, 196, 288, 1296,
      196, 32, 512, 196, 64, 800, 196, 64, 512, 196, 256, 528,
      196, 160, 528, 196, 320, 1440, 196, 32, 528, 196, 128, 800,
      196, 128, 528, 49, 256, 832, 49, 160, 832, 49, 320, 1440,
      49, 48, 832, 49, 128, 1200, 49, 128, 832, 49, 384, 832,
      49, 192, 832, 49, 384, 1728, 49, 48, 832, 49, 128, 1200,
      49, 128, 832, 16, 128, 508, 1, 1024, 2048, 1, 1008, 1024,
      16, 128, 528, 1, 1024, 2048, 1, 1008, 1024, 1, 1008, 1024,
  };
  assert(sizeof(googlenet_gemm_sizes) %
             (3 * sizeof(googlenet_gemm_sizes[0])) ==
         0);
  const std::size_t num_googlenet_gemms =
      sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));

  std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
  for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
    googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
    googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
    googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
  }

  const double mintime = 20.0;
  benchmark_gemm_sizes(context, googlenet_gemms, mintime);
}

void benchmark_small_model(GemmContext* context) {
  // These are the m, n, k sizes for a small model with large batches.
  const int small_model_gemm_sizes[] = {
      29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
  };
  assert(sizeof(small_model_gemm_sizes) %
             (3 * sizeof(small_model_gemm_sizes[0])) ==
         0);
  const std::size_t num_small_model_gemms =
      sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));

  std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
  for (std::size_t i = 0; i < num_small_model_gemms; i++) {
    small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
    small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
    small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
  }

  const double mintime = 10.0;
  benchmark_gemm_sizes(context, small_model_gemms, mintime);
}

void benchmark_all() {
  {
    gemmlowp::GemmContext context;
    std::cout << "Benchmarking small model GEMMs..." << std::endl;
    gemmlowp::benchmark_small_model(&context);
  }

  {
    gemmlowp::GemmContext context;
    std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
    gemmlowp::benchmark_googlenet(&context);
  }

  {
    gemmlowp::GemmContext context;
    std::cout << "Benchmarking default mode (typically multi-threaded)..."
              << std::endl;
    gemmlowp::benchmark(&context);
  }

  {
    gemmlowp::GemmContext context;
    context.set_max_num_threads(1);
    std::cout << "Benchmarking single-threaded mode..." << std::endl;
    gemmlowp::benchmark(&context);
  }
}

}  // end namespace gemmlowp

// For iOS, we need to define our own main(), so skip it here.
#if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
int main() { gemmlowp::benchmark_all(); }
#endif
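// A hypothetical standalone build from a gemmlowp checkout (assumed example
// only; the exact flags may differ with your toolchain):
//   c++ -O3 -std=c++11 -pthread test/benchmark.cc -o benchmark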