1 // Example command line to build on Android ARM64: 2 /* 3 ~/android/toolchains/r15c-aarch64/bin/aarch64-linux-android-clang++ \ 4 test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -fPIE -static \ 5 -DBENCHMARK_QUICK -DBENCHMARK_8bit 6 */ 7 8 #include <algorithm> 9 #include <cmath> 10 #include <cstdint> 11 #include <ctime> 12 #include <iostream> 13 #include <map> 14 #include <random> 15 #include <set> 16 17 #include "../public/gemmlowp.h" 18 19 #if defined GEMMLOWP_ANDROID && defined GEMMLOWP_ARM_32 20 // Compilation workaround 21 namespace std { 22 using ::round; 23 } 24 #endif 25 26 // Minimum duration of each benchmark measurement. Also, duration 27 // of sleep time between each two consecutive benchmark measurements to 28 // prevent over-heating. 29 const double kBenchmarkSecs = 0.1; 30 31 // Sleep time before each benchmark. 32 const int kCooldownBeforeBenchmarkSecs = 0; 33 34 // Number of benchmark passes. 35 const int kPasses = 4; 36 37 #ifdef BENCHMARK_NUM_THREADS 38 const int kNumThreads = BENCHMARK_NUM_THREADS; 39 #else 40 const int kNumThreads = 1; 41 #endif 42 43 namespace gemmlowp { 44 45 // gemmlowp itself doesn't have a Matrix class, only a MatrixMap class, 46 // since it only maps existing data. In tests though, we need to 47 // create our own matrices. 48 template <typename tScalar, MapOrder tOrder> 49 class Matrix : public MatrixMap<tScalar, tOrder> { 50 public: 51 typedef MatrixMap<tScalar, tOrder> Map; 52 typedef MatrixMap<const tScalar, tOrder> ConstMap; 53 typedef typename Map::Scalar Scalar; 54 static const MapOrder Order = tOrder; 55 using Map::cols_; 56 using Map::data_; 57 using Map::kOrder; 58 using Map::rows_; 59 using Map::stride_; 60 61 public: 62 Matrix() : Map(nullptr, 0, 0, 0) {} 63 64 Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); } 65 66 Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; } 67 68 Matrix& operator=(const Matrix& other) { 69 Resize(other.rows_, other.cols_); 70 std::memcpy(data_, other.data_, size() * sizeof(Scalar)); 71 return *this; 72 } 73 74 friend bool operator==(const Matrix& a, const Matrix& b) { 75 return a.rows_ == b.rows_ && a.cols_ == b.cols_ && 76 !std::memcmp(a.data_, b.data_, a.size()); 77 } 78 79 void Resize(int rows, int cols) { 80 rows_ = rows; 81 cols_ = cols; 82 stride_ = kOrder == MapOrder::ColMajor ? rows : cols; 83 storage.resize(size()); 84 data_ = storage.data(); 85 } 86 87 int size() const { return rows_ * cols_; } 88 89 Map& map() { return *static_cast<Map*>(this); } 90 91 ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); } 92 93 protected: 94 std::vector<Scalar> storage; 95 }; 96 97 template <typename MatrixType> 98 void MakeZero(MatrixType* m) { 99 for (int c = 0; c < m->cols(); c++) { 100 for (int r = 0; r < m->rows(); r++) { 101 (*m)(r, c) = 128; 102 } 103 } 104 } 105 106 } // end namespace gemmlowp 107 108 template <typename BitDepthParams> 109 float benchmark_8bit(int rows, int depth, int cols) { 110 using namespace gemmlowp; 111 typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType; 112 typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; 113 typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType; 114 115 LhsType lhs; 116 RhsType rhs; 117 ResultType result; 118 lhs.Resize(rows, depth); 119 rhs.Resize(depth, cols); 120 result.Resize(rows, cols); 121 MakeZero(&lhs); 122 MakeZero(&rhs); 123 MakeZero(&result); 124 125 typedef std::tuple<OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint, 126 OutputStageSaturatingCastToUint8> 127 Pipeline; 128 gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint 129 quantize_down_stage; 130 quantize_down_stage.result_offset_after_shift = 128; 131 quantize_down_stage.result_fixedpoint_multiplier = 1234567890; 132 quantize_down_stage.result_shift = 16; 133 gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; 134 const auto output_pipeline = 135 std::make_tuple(quantize_down_stage, saturating_cast_stage); 136 GemmContext gemm_context; 137 gemm_context.set_max_num_threads(kNumThreads); 138 gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>( 139 &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, 140 -128, output_pipeline); 141 142 double time_start = real_time_in_seconds(); 143 double t = time_start; 144 int iters = 0; 145 int iters_at_a_time = 1; 146 while (t - time_start < kBenchmarkSecs) { 147 for (int i = 0; i < iters_at_a_time; i++) { 148 gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, 149 BitDepthParams>( 150 &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, 151 -128, output_pipeline); 152 iters++; 153 } 154 iters_at_a_time *= 2; 155 t = real_time_in_seconds(); 156 } 157 return (t - time_start) / iters; 158 } 159 160 template <typename BitDepthParams> 161 float benchmark_8bit_to_32bit(int rows, int depth, int cols) { 162 using namespace gemmlowp; 163 typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType; 164 typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; 165 typedef Matrix<std::int32_t, MapOrder::ColMajor> ResultType; 166 167 LhsType lhs; 168 RhsType rhs; 169 ResultType result; 170 lhs.Resize(rows, depth); 171 rhs.Resize(depth, cols); 172 result.Resize(rows, cols); 173 MakeZero(&lhs); 174 MakeZero(&rhs); 175 MakeZero(&result); 176 177 typedef std::tuple<> EmptyPipeline; 178 GemmContext gemm_context; 179 gemm_context.set_max_num_threads(kNumThreads); 180 gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>( 181 &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, 182 -128, EmptyPipeline()); 183 184 double time_start = real_time_in_seconds(); 185 double t = time_start; 186 int iters = 0; 187 int iters_at_a_time = 1; 188 while (t - time_start < kBenchmarkSecs) { 189 for (int i = 0; i < iters_at_a_time; i++) { 190 gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, 191 BitDepthParams>( 192 &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, 193 -128, EmptyPipeline()); 194 iters++; 195 } 196 iters_at_a_time *= 2; 197 t = real_time_in_seconds(); 198 } 199 return (t - time_start) / iters; 200 } 201 202 struct Shape { 203 int rows; 204 int depth; 205 int cols; 206 }; 207 208 bool operator==(const Shape& s1, const Shape& s2) { 209 return s1.rows == s2.rows && s1.depth == s2.depth && s1.cols == s2.cols; 210 } 211 212 bool operator<(const Shape& shape1, const Shape& shape2) { 213 return shape1.depth < shape2.depth || 214 (shape1.depth == shape2.depth && 215 (shape1.rows < shape2.rows || 216 (shape1.rows == shape2.rows && shape1.cols < shape2.cols))); 217 }; 218 219 #ifdef _WIN32 220 #define sleep(t) Sleep(t) 221 #endif 222 223 float benchmark(const Shape& shape) { 224 if (kCooldownBeforeBenchmarkSecs) { 225 sleep(kCooldownBeforeBenchmarkSecs); 226 } 227 #if defined BENCHMARK_8bit 228 // Benchmark the fast 8bit path, using L8R8WithLhsNonzeroBitDepthParams. 229 // This is the recommended thing to default to: it's what most applications 230 // want to use, as it's the fastest. 231 // The contract is that LHS must take values in [1, 255], while RHS can take 232 // any value in [0, 255]. 233 return benchmark_8bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( 234 shape.rows, shape.depth, shape.cols); 235 #elif defined BENCHMARK_8bit_wide 236 // Variant benchmarking the slower (mostly legacy) DefaultL8R8BitDepthParams. 237 // The only contract difference is that both LHS and RHS can take values in 238 // [0, 255]. 239 return benchmark_8bit<gemmlowp::DefaultL8R8BitDepthParams>( 240 shape.rows, shape.depth, shape.cols); 241 #elif defined BENCHMARK_8bit_to_32bit 242 // Variant of BENCHMARK_8bit where the user asks for getting raw int32 243 // accumulators, instead of a 8bit-downscaled result. 244 return benchmark_8bit_to_32bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( 245 shape.rows, shape.depth, shape.cols); 246 #elif defined BENCHMARK_8bit_to_32bit_wide 247 // Variant of BENCHMARK_8bit_wide where the user asks for getting raw int32 248 // accumulators, instead of a 8bit-downscaled result. 249 return benchmark_8bit_to_32bit<gemmlowp::DefaultL8R8BitDepthParams>( 250 shape.rows, shape.depth, shape.cols); 251 #elif defined BENCHMARK_float 252 return benchmark_float(shape.rows, shape.depth, shape.cols); 253 #else 254 #error What arithmetic path should we benchmark? (Suggestion: #define BENCHMARK_8bit) 255 #endif 256 } 257 258 std::set<int> all_sizes() { 259 std::set<int> sizes; 260 for (int i = 1; i <= 2048; i *= 2) { 261 sizes.insert(i); 262 } 263 for (double x = 8; x <= 2048; x *= std::sqrt(2.)) { 264 sizes.insert(static_cast<int>(std::round(x))); 265 } 266 for (double x = 16; x <= 512; x *= std::pow(2., 1. / 4.)) { 267 sizes.insert(static_cast<int>(std::round(x))); 268 } 269 return sizes; 270 } 271 272 std::mt19937& RandomEngine() { 273 static std::mt19937 engine; 274 return engine; 275 } 276 277 std::vector<Shape> all_shapes_in_random_order() { 278 std::vector<Shape> shapes; 279 const std::set<int> sizes = all_sizes(); 280 #if defined BENCHMARK_ROWS 281 // Benchmark one specific shape 282 Shape shape; 283 shape.rows = BENCHMARK_ROWS; 284 shape.depth = BENCHMARK_DEPTH; 285 shape.cols = BENCHMARK_COLS; 286 shapes.push_back(shape); 287 #elif defined BENCHMARK_QUICK 288 // Benchmark an assortment of cubic shapes 289 for (int size : sizes) { 290 Shape shape; 291 shape.rows = size; 292 shape.depth = size; 293 shape.cols = size; 294 shapes.push_back(shape); 295 } 296 #elif defined BENCHMARK_EXHAUSTIVE 297 // Benchmark all sorts of shapes 298 for (int rows : sizes) { 299 for (int depth : sizes) { 300 for (int cols : sizes) { 301 Shape shape; 302 shape.rows = rows; 303 shape.depth = depth; 304 shape.cols = cols; 305 shapes.push_back(shape); 306 } 307 } 308 } 309 #else 310 #error What shapes should we benchmark? (Suggestion: #define BENCHMARK_QUICK) 311 #endif 312 std::shuffle(std::begin(shapes), std::end(shapes), RandomEngine()); 313 return shapes; 314 } 315 316 void run_benchmarks(std::map<Shape, float>* results) { 317 std::vector<Shape> shapes; 318 for (int pass = 0; pass < kPasses; pass++) { 319 const std::vector<Shape> pass_shapes = all_shapes_in_random_order(); 320 shapes.insert(std::end(shapes), std::begin(pass_shapes), 321 std::end(pass_shapes)); 322 } 323 324 const double time_start = gemmlowp::real_time_in_seconds(); 325 for (std::size_t i = 0; i < shapes.size(); i++) { 326 const double ratio = static_cast<double>(i) / shapes.size(); 327 const double elapsed = gemmlowp::real_time_in_seconds() - time_start; 328 const double elapsed_hours = elapsed / 3600.; 329 const double eta_hours = elapsed_hours * (1. - ratio) / ratio; 330 fprintf(stderr, 331 "Benchmarking: %.2f%% done, Elapsed: %.2f hours, ETA: %.2f " 332 "hours... \r", 333 100. * ratio, elapsed_hours, eta_hours); 334 fflush(stderr); 335 const Shape& shape = shapes[i]; 336 float latency = benchmark(shape); 337 if (results->count(shape)) { 338 (*results)[shape] = std::min(latency, (*results)[shape]); 339 } else { 340 (*results)[shape] = latency; 341 } 342 } 343 fprintf(stderr, "\n"); 344 } 345 346 int main() { 347 std::map<Shape, float> results; 348 run_benchmarks(&results); 349 printf("Using %d thread(s)\n", kNumThreads); 350 printf("depth,rows,cols,latency(s),Gop/s\n"); 351 for (const auto& result : results) { 352 const Shape& shape = result.first; 353 printf("%d,%d,%d,%.4g,%.4g\n", shape.depth, shape.rows, shape.cols, 354 result.second, 355 2e-9 * shape.depth * shape.rows * shape.cols / result.second); 356 } 357 } 358