1 // g++ -O3 -DNDEBUG -I.. -L /usr/lib64/atlas/ benchBlasGemm.cpp -o benchBlasGemm -lrt -lcblas 2 // possible options: 3 // -DEIGEN_DONT_VECTORIZE 4 // -msse2 5 6 // #define EIGEN_DEFAULT_TO_ROW_MAJOR 7 #define _FLOAT 8 9 #include <iostream> 10 11 #include <Eigen/Core> 12 #include "BenchTimer.h" 13 14 // include the BLAS headers 15 extern "C" { 16 #include <cblas.h> 17 } 18 #include <string> 19 20 #ifdef _FLOAT 21 typedef float Scalar; 22 #define CBLAS_GEMM cblas_sgemm 23 #else 24 typedef double Scalar; 25 #define CBLAS_GEMM cblas_dgemm 26 #endif 27 28 29 typedef Eigen::Matrix<Scalar,Eigen::Dynamic,Eigen::Dynamic> MyMatrix; 30 void bench_eigengemm(MyMatrix& mc, const MyMatrix& ma, const MyMatrix& mb, int nbloops); 31 void check_product(int M, int N, int K); 32 void check_product(void); 33 34 int main(int argc, char *argv[]) 35 { 36 // disable SSE exceptions 37 #ifdef __GNUC__ 38 { 39 int aux; 40 asm( 41 "stmxcsr %[aux] \n\t" 42 "orl $32832, %[aux] \n\t" 43 "ldmxcsr %[aux] \n\t" 44 : : [aux] "m" (aux)); 45 } 46 #endif 47 48 int nbtries=1, nbloops=1, M, N, K; 49 50 if (argc==2) 51 { 52 if (std::string(argv[1])=="check") 53 check_product(); 54 else 55 M = N = K = atoi(argv[1]); 56 } 57 else if ((argc==3) && (std::string(argv[1])=="auto")) 58 { 59 M = N = K = atoi(argv[2]); 60 nbloops = 1000000000/(M*M*M); 61 if (nbloops<1) 62 nbloops = 1; 63 nbtries = 6; 64 } 65 else if (argc==4) 66 { 67 M = N = K = atoi(argv[1]); 68 nbloops = atoi(argv[2]); 69 nbtries = atoi(argv[3]); 70 } 71 else if (argc==6) 72 { 73 M = atoi(argv[1]); 74 N = atoi(argv[2]); 75 K = atoi(argv[3]); 76 nbloops = atoi(argv[4]); 77 nbtries = atoi(argv[5]); 78 } 79 else 80 { 81 std::cout << "Usage: " << argv[0] << " size \n"; 82 std::cout << "Usage: " << argv[0] << " auto size\n"; 83 std::cout << "Usage: " << argv[0] << " size nbloops nbtries\n"; 84 std::cout << "Usage: " << argv[0] << " M N K nbloops nbtries\n"; 85 std::cout << "Usage: " << argv[0] << " check\n"; 86 std::cout << "Options:\n"; 87 std::cout << " size unique size of the 2 matrices (integer)\n"; 88 std::cout << " auto automatically set the number of repetitions and tries\n"; 89 std::cout << " nbloops number of times the GEMM routines is executed\n"; 90 std::cout << " nbtries number of times the loop is benched (return the best try)\n"; 91 std::cout << " M N K sizes of the matrices: MxN = MxK * KxN (integers)\n"; 92 std::cout << " check check eigen product using cblas as a reference\n"; 93 exit(1); 94 } 95 96 double nbmad = double(M) * double(N) * double(K) * double(nbloops); 97 98 if (!(std::string(argv[1])=="auto")) 99 std::cout << M << " x " << N << " x " << K << "\n"; 100 101 Scalar alpha, beta; 102 MyMatrix ma(M,K), mb(K,N), mc(M,N); 103 ma = MyMatrix::Random(M,K); 104 mb = MyMatrix::Random(K,N); 105 mc = MyMatrix::Random(M,N); 106 107 Eigen::BenchTimer timer; 108 109 // we simply compute c += a*b, so: 110 alpha = 1; 111 beta = 1; 112 113 // bench cblas 114 // ROWS_A, COLS_B, COLS_A, 1.0, A, COLS_A, B, COLS_B, 0.0, C, COLS_B); 115 if (!(std::string(argv[1])=="auto")) 116 { 117 timer.reset(); 118 for (uint k=0 ; k<nbtries ; ++k) 119 { 120 timer.start(); 121 for (uint j=0 ; j<nbloops ; ++j) 122 #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR 123 CBLAS_GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, ma.data(), K, mb.data(), N, beta, mc.data(), N); 124 #else 125 CBLAS_GEMM(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, ma.data(), M, mb.data(), K, beta, mc.data(), M); 126 #endif 127 timer.stop(); 128 } 129 if (!(std::string(argv[1])=="auto")) 130 std::cout << "cblas: " << timer.value() << " (" << 1e-3*floor(1e-6*nbmad/timer.value()) << " GFlops/s)\n"; 131 else 132 std::cout << M << " : " << timer.value() << " ; " << 1e-3*floor(1e-6*nbmad/timer.value()) << "\n"; 133 } 134 135 // clear 136 ma = MyMatrix::Random(M,K); 137 mb = MyMatrix::Random(K,N); 138 mc = MyMatrix::Random(M,N); 139 140 // eigen 141 // if (!(std::string(argv[1])=="auto")) 142 { 143 timer.reset(); 144 for (uint k=0 ; k<nbtries ; ++k) 145 { 146 timer.start(); 147 bench_eigengemm(mc, ma, mb, nbloops); 148 timer.stop(); 149 } 150 if (!(std::string(argv[1])=="auto")) 151 std::cout << "eigen : " << timer.value() << " (" << 1e-3*floor(1e-6*nbmad/timer.value()) << " GFlops/s)\n"; 152 else 153 std::cout << M << " : " << timer.value() << " ; " << 1e-3*floor(1e-6*nbmad/timer.value()) << "\n"; 154 } 155 156 std::cout << "l1: " << Eigen::l1CacheSize() << std::endl; 157 std::cout << "l2: " << Eigen::l2CacheSize() << std::endl; 158 159 160 return 0; 161 } 162 163 using namespace Eigen; 164 165 void bench_eigengemm(MyMatrix& mc, const MyMatrix& ma, const MyMatrix& mb, int nbloops) 166 { 167 for (uint j=0 ; j<nbloops ; ++j) 168 mc.noalias() += ma * mb; 169 } 170 171 #define MYVERIFY(A,M) if (!(A)) { \ 172 std::cout << "FAIL: " << M << "\n"; \ 173 } 174 void check_product(int M, int N, int K) 175 { 176 MyMatrix ma(M,K), mb(K,N), mc(M,N), maT(K,M), mbT(N,K), meigen(M,N), mref(M,N); 177 ma = MyMatrix::Random(M,K); 178 mb = MyMatrix::Random(K,N); 179 maT = ma.transpose(); 180 mbT = mb.transpose(); 181 mc = MyMatrix::Random(M,N); 182 183 MyMatrix::Scalar eps = 1e-4; 184 185 meigen = mref = mc; 186 CBLAS_GEMM(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, ma.data(), M, mb.data(), K, 1, mref.data(), M); 187 meigen += ma * mb; 188 MYVERIFY(meigen.isApprox(mref, eps),". * ."); 189 190 meigen = mref = mc; 191 CBLAS_GEMM(CblasColMajor, CblasTrans, CblasNoTrans, M, N, K, 1, maT.data(), K, mb.data(), K, 1, mref.data(), M); 192 meigen += maT.transpose() * mb; 193 MYVERIFY(meigen.isApprox(mref, eps),"T * ."); 194 195 meigen = mref = mc; 196 CBLAS_GEMM(CblasColMajor, CblasTrans, CblasTrans, M, N, K, 1, maT.data(), K, mbT.data(), N, 1, mref.data(), M); 197 meigen += (maT.transpose()) * (mbT.transpose()); 198 MYVERIFY(meigen.isApprox(mref, eps),"T * T"); 199 200 meigen = mref = mc; 201 CBLAS_GEMM(CblasColMajor, CblasNoTrans, CblasTrans, M, N, K, 1, ma.data(), M, mbT.data(), N, 1, mref.data(), M); 202 meigen += ma * mbT.transpose(); 203 MYVERIFY(meigen.isApprox(mref, eps),". * T"); 204 } 205 206 void check_product(void) 207 { 208 int M, N, K; 209 for (uint i=0; i<1000; ++i) 210 { 211 M = internal::random<int>(1,64); 212 N = internal::random<int>(1,768); 213 K = internal::random<int>(1,768); 214 M = (0 + M) * 1; 215 std::cout << M << " x " << N << " x " << K << "\n"; 216 check_product(M, N, K); 217 } 218 } 219 220