1 2 // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out 3 // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out 4 5 #include <iostream> 6 #include <Eigen/Core> 7 #include <bench/BenchTimer.h> 8 9 using namespace std; 10 using namespace Eigen; 11 12 #ifndef SCALAR 13 // #define SCALAR std::complex<float> 14 #define SCALAR float 15 #endif 16 17 typedef SCALAR Scalar; 18 typedef NumTraits<Scalar>::Real RealScalar; 19 typedef Matrix<RealScalar,Dynamic,Dynamic> A; 20 typedef Matrix</*Real*/Scalar,Dynamic,Dynamic> B; 21 typedef Matrix<Scalar,Dynamic,Dynamic> C; 22 typedef Matrix<RealScalar,Dynamic,Dynamic> M; 23 24 #ifdef HAVE_BLAS 25 26 extern "C" { 27 #include <bench/btl/libs/C_BLAS/blas.h> 28 } 29 30 static float fone = 1; 31 static float fzero = 0; 32 static double done = 1; 33 static double szero = 0; 34 static std::complex<float> cfone = 1; 35 static std::complex<float> cfzero = 0; 36 static std::complex<double> cdone = 1; 37 static std::complex<double> cdzero = 0; 38 static char notrans = 'N'; 39 static char trans = 'T'; 40 static char nonunit = 'N'; 41 static char lower = 'L'; 42 static char right = 'R'; 43 static int intone = 1; 44 45 void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) 46 { 47 int M = c.rows(); int N = c.cols(); int K = a.cols(); 48 int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); 49 50 sgemm_(¬rans,¬rans,&M,&N,&K,&fone, 51 const_cast<float*>(a.data()),&lda, 52 const_cast<float*>(b.data()),&ldb,&fone, 53 c.data(),&ldc); 54 } 55 56 EIGEN_DONT_INLINE void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c) 57 { 58 int M = c.rows(); int N = c.cols(); int K = a.cols(); 59 int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); 60 61 dgemm_(¬rans,¬rans,&M,&N,&K,&done, 62 const_cast<double*>(a.data()),&lda, 63 const_cast<double*>(b.data()),&ldb,&done, 64 c.data(),&ldc); 65 } 66 67 void blas_gemm(const MatrixXcf& a, const MatrixXcf& b, MatrixXcf& c) 68 { 69 int M = c.rows(); int N = c.cols(); int K = a.cols(); 70 int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); 71 72 cgemm_(¬rans,¬rans,&M,&N,&K,(float*)&cfone, 73 const_cast<float*>((const float*)a.data()),&lda, 74 const_cast<float*>((const float*)b.data()),&ldb,(float*)&cfone, 75 (float*)c.data(),&ldc); 76 } 77 78 void blas_gemm(const MatrixXcd& a, const MatrixXcd& b, MatrixXcd& c) 79 { 80 int M = c.rows(); int N = c.cols(); int K = a.cols(); 81 int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); 82 83 zgemm_(¬rans,¬rans,&M,&N,&K,(double*)&cdone, 84 const_cast<double*>((const double*)a.data()),&lda, 85 const_cast<double*>((const double*)b.data()),&ldb,(double*)&cdone, 86 (double*)c.data(),&ldc); 87 } 88 89 90 91 #endif 92 93 void matlab_cplx_cplx(const M& ar, const M& ai, const M& br, const M& bi, M& cr, M& ci) 94 { 95 cr.noalias() += ar * br; 96 cr.noalias() -= ai * bi; 97 ci.noalias() += ar * bi; 98 ci.noalias() += ai * br; 99 } 100 101 void matlab_real_cplx(const M& a, const M& br, const M& bi, M& cr, M& ci) 102 { 103 cr.noalias() += a * br; 104 ci.noalias() += a * bi; 105 } 106 107 void matlab_cplx_real(const M& ar, const M& ai, const M& b, M& cr, M& ci) 108 { 109 cr.noalias() += ar * b; 110 ci.noalias() += ai * b; 111 } 112 113 template<typename A, typename B, typename C> 114 EIGEN_DONT_INLINE void gemm(const A& a, const B& b, C& c) 115 { 116 c.noalias() += a * b; 117 } 118 119 int main(int argc, char ** argv) 120 { 121 std::ptrdiff_t l1 = internal::queryL1CacheSize(); 122 std::ptrdiff_t l2 = internal::queryTopLevelCacheSize(); 123 std::cout << "L1 cache size = " << (l1>0 ? l1/1024 : -1) << " KB\n"; 124 std::cout << "L2/L3 cache size = " << (l2>0 ? l2/1024 : -1) << " KB\n"; 125 typedef internal::gebp_traits<Scalar,Scalar> Traits; 126 std::cout << "Register blocking = " << Traits::mr << " x " << Traits::nr << "\n"; 127 128 int rep = 1; // number of repetitions per try 129 int tries = 2; // number of tries, we keep the best 130 131 int s = 2048; 132 int cache_size = -1; 133 134 bool need_help = false; 135 for (int i=1; i<argc; ++i) 136 { 137 if(argv[i][0]=='s') 138 s = atoi(argv[i]+1); 139 else if(argv[i][0]=='c') 140 cache_size = atoi(argv[i]+1); 141 else if(argv[i][0]=='t') 142 tries = atoi(argv[i]+1); 143 else if(argv[i][0]=='p') 144 rep = atoi(argv[i]+1); 145 else 146 need_help = true; 147 } 148 149 if(need_help) 150 { 151 std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n"; 152 return 1; 153 } 154 155 if(cache_size>0) 156 setCpuCacheSizes(cache_size,96*cache_size); 157 158 int m = s; 159 int n = s; 160 int p = s; 161 A a(m,p); a.setRandom(); 162 B b(p,n); b.setRandom(); 163 C c(m,n); c.setOnes(); 164 C rc = c; 165 166 std::cout << "Matrix sizes = " << m << "x" << p << " * " << p << "x" << n << "\n"; 167 std::ptrdiff_t mc(m), nc(n), kc(p); 168 internal::computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); 169 std::cout << "blocking size (mc x kc) = " << mc << " x " << kc << "\n"; 170 171 C r = c; 172 173 // check the parallel product is correct 174 #if defined EIGEN_HAS_OPENMP 175 int procs = omp_get_max_threads(); 176 if(procs>1) 177 { 178 #ifdef HAVE_BLAS 179 blas_gemm(a,b,r); 180 #else 181 omp_set_num_threads(1); 182 r.noalias() += a * b; 183 omp_set_num_threads(procs); 184 #endif 185 c.noalias() += a * b; 186 if(!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n"; 187 } 188 #elif defined HAVE_BLAS 189 blas_gemm(a,b,r); 190 c.noalias() += a * b; 191 if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n"; 192 #else 193 gemm(a,b,c); 194 r.noalias() += a.cast<Scalar>() * b.cast<Scalar>(); 195 if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n"; 196 #endif 197 198 #ifdef HAVE_BLAS 199 BenchTimer tblas; 200 c = rc; 201 BENCH(tblas, tries, rep, blas_gemm(a,b,c)); 202 std::cout << "blas cpu " << tblas.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tblas.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tblas.total(CPU_TIMER) << "s)\n"; 203 std::cout << "blas real " << tblas.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tblas.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tblas.total(REAL_TIMER) << "s)\n"; 204 #endif 205 206 BenchTimer tmt; 207 c = rc; 208 BENCH(tmt, tries, rep, gemm(a,b,c)); 209 std::cout << "eigen cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n"; 210 std::cout << "eigen real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n"; 211 212 #ifdef EIGEN_HAS_OPENMP 213 if(procs>1) 214 { 215 BenchTimer tmono; 216 omp_set_num_threads(1); 217 Eigen::internal::setNbThreads(1); 218 c = rc; 219 BENCH(tmono, tries, rep, gemm(a,b,c)); 220 std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n"; 221 std::cout << "eigen mono real " << tmono.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(REAL_TIMER) << "s)\n"; 222 std::cout << "mt speed up x" << tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER) << " => " << (100.0*tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER))/procs << "%\n"; 223 } 224 #endif 225 226 #ifdef DECOUPLED 227 if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex)) 228 { 229 M ar(m,p); ar.setRandom(); 230 M ai(m,p); ai.setRandom(); 231 M br(p,n); br.setRandom(); 232 M bi(p,n); bi.setRandom(); 233 M cr(m,n); cr.setRandom(); 234 M ci(m,n); ci.setRandom(); 235 236 BenchTimer t; 237 BENCH(t, tries, rep, matlab_cplx_cplx(ar,ai,br,bi,cr,ci)); 238 std::cout << "\"matlab\" cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; 239 std::cout << "\"matlab\" real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; 240 } 241 if((!NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex)) 242 { 243 M a(m,p); a.setRandom(); 244 M br(p,n); br.setRandom(); 245 M bi(p,n); bi.setRandom(); 246 M cr(m,n); cr.setRandom(); 247 M ci(m,n); ci.setRandom(); 248 249 BenchTimer t; 250 BENCH(t, tries, rep, matlab_real_cplx(a,br,bi,cr,ci)); 251 std::cout << "\"matlab\" cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; 252 std::cout << "\"matlab\" real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; 253 } 254 if((NumTraits<A::Scalar>::IsComplex) && (!NumTraits<B::Scalar>::IsComplex)) 255 { 256 M ar(m,p); ar.setRandom(); 257 M ai(m,p); ai.setRandom(); 258 M b(p,n); b.setRandom(); 259 M cr(m,n); cr.setRandom(); 260 M ci(m,n); ci.setRandom(); 261 262 BenchTimer t; 263 BENCH(t, tries, rep, matlab_cplx_real(ar,ai,b,cr,ci)); 264 std::cout << "\"matlab\" cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; 265 std::cout << "\"matlab\" real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; 266 } 267 #endif 268 269 return 0; 270 } 271 272