1 2 #include <iostream> 3 #include <Eigen/Core> 4 #include <bench/BenchTimer.h> 5 using namespace Eigen; 6 7 #ifndef SIZE 8 #define SIZE 50 9 #endif 10 11 #ifndef REPEAT 12 #define REPEAT 10000 13 #endif 14 15 typedef float Scalar; 16 17 __attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size); 18 __attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c); 19 __attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c); 20 21 int main(int argc, char* argv[]) 22 { 23 int size = SIZE * 8; 24 int size2 = size * size; 25 Scalar* a = internal::aligned_new<Scalar>(size2); 26 Scalar* b = internal::aligned_new<Scalar>(size2+4)+1; 27 Scalar* c = internal::aligned_new<Scalar>(size2); 28 29 for (int i=0; i<size; ++i) 30 { 31 a[i] = b[i] = c[i] = 0; 32 } 33 34 BenchTimer timer; 35 36 timer.reset(); 37 for (int k=0; k<10; ++k) 38 { 39 timer.start(); 40 benchVec(a, b, c, size2); 41 timer.stop(); 42 } 43 std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; 44 return 0; 45 for (int innersize = size; innersize>2 ; --innersize) 46 { 47 if (size2%innersize==0) 48 { 49 int outersize = size2/innersize; 50 MatrixXf ma = Map<MatrixXf>(a, innersize, outersize ); 51 MatrixXf mb = Map<MatrixXf>(b, innersize, outersize ); 52 MatrixXf mc = Map<MatrixXf>(c, innersize, outersize ); 53 timer.reset(); 54 for (int k=0; k<3; ++k) 55 { 56 timer.start(); 57 benchVec(ma, mb, mc); 58 timer.stop(); 59 } 60 std::cout << innersize << " x " << outersize << " " << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; 61 } 62 } 63 64 VectorXf va = Map<VectorXf>(a, size2); 65 VectorXf vb = Map<VectorXf>(b, size2); 66 VectorXf vc = Map<VectorXf>(c, size2); 67 timer.reset(); 68 for (int k=0; k<3; ++k) 69 { 70 timer.start(); 71 benchVec(va, vb, vc); 72 timer.stop(); 73 } 74 std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; 75 76 return 0; 77 } 78 79 void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c) 80 { 81 for (int k=0; k<REPEAT; ++k) 82 a = a + b; 83 } 84 85 void benchVec(VectorXf& a, VectorXf& b, VectorXf& c) 86 { 87 for (int k=0; k<REPEAT; ++k) 88 a = a + b; 89 } 90 91 void benchVec(Scalar* a, Scalar* b, Scalar* c, int size) 92 { 93 typedef internal::packet_traits<Scalar>::type PacketScalar; 94 const int PacketSize = internal::packet_traits<Scalar>::size; 95 PacketScalar a0, a1, a2, a3, b0, b1, b2, b3; 96 for (int k=0; k<REPEAT; ++k) 97 for (int i=0; i<size; i+=PacketSize*8) 98 { 99 // a0 = internal::pload(&a[i]); 100 // b0 = internal::pload(&b[i]); 101 // a1 = internal::pload(&a[i+1*PacketSize]); 102 // b1 = internal::pload(&b[i+1*PacketSize]); 103 // a2 = internal::pload(&a[i+2*PacketSize]); 104 // b2 = internal::pload(&b[i+2*PacketSize]); 105 // a3 = internal::pload(&a[i+3*PacketSize]); 106 // b3 = internal::pload(&b[i+3*PacketSize]); 107 // internal::pstore(&a[i], internal::padd(a0, b0)); 108 // a0 = internal::pload(&a[i+4*PacketSize]); 109 // b0 = internal::pload(&b[i+4*PacketSize]); 110 // 111 // internal::pstore(&a[i+1*PacketSize], internal::padd(a1, b1)); 112 // a1 = internal::pload(&a[i+5*PacketSize]); 113 // b1 = internal::pload(&b[i+5*PacketSize]); 114 // 115 // internal::pstore(&a[i+2*PacketSize], internal::padd(a2, b2)); 116 // a2 = internal::pload(&a[i+6*PacketSize]); 117 // b2 = internal::pload(&b[i+6*PacketSize]); 118 // 119 // internal::pstore(&a[i+3*PacketSize], internal::padd(a3, b3)); 120 // a3 = internal::pload(&a[i+7*PacketSize]); 121 // b3 = internal::pload(&b[i+7*PacketSize]); 122 // 123 // internal::pstore(&a[i+4*PacketSize], internal::padd(a0, b0)); 124 // internal::pstore(&a[i+5*PacketSize], internal::padd(a1, b1)); 125 // internal::pstore(&a[i+6*PacketSize], internal::padd(a2, b2)); 126 // internal::pstore(&a[i+7*PacketSize], internal::padd(a3, b3)); 127 128 internal::pstore(&a[i+2*PacketSize], internal::padd(internal::ploadu(&a[i+2*PacketSize]), internal::ploadu(&b[i+2*PacketSize]))); 129 internal::pstore(&a[i+3*PacketSize], internal::padd(internal::ploadu(&a[i+3*PacketSize]), internal::ploadu(&b[i+3*PacketSize]))); 130 internal::pstore(&a[i+4*PacketSize], internal::padd(internal::ploadu(&a[i+4*PacketSize]), internal::ploadu(&b[i+4*PacketSize]))); 131 internal::pstore(&a[i+5*PacketSize], internal::padd(internal::ploadu(&a[i+5*PacketSize]), internal::ploadu(&b[i+5*PacketSize]))); 132 internal::pstore(&a[i+6*PacketSize], internal::padd(internal::ploadu(&a[i+6*PacketSize]), internal::ploadu(&b[i+6*PacketSize]))); 133 internal::pstore(&a[i+7*PacketSize], internal::padd(internal::ploadu(&a[i+7*PacketSize]), internal::ploadu(&b[i+7*PacketSize]))); 134 } 135 } 136