Home | History | Annotate | Download | only in tensors
      1 #define EIGEN_USE_THREADS
      2 
      3 #include <string>
      4 
      5 #include "tensor_benchmarks.h"
      6 
      7 #define CREATE_THREAD_POOL(threads)             \
      8 Eigen::ThreadPool pool(threads);                \
      9 Eigen::ThreadPoolDevice device(&pool, threads);
     10 
     11 // Simple functions
     12 #define BM_FuncCPU(FUNC, THREADS)                                    \
     13   static void BM_##FUNC##_##THREADS##T(int iters, int N) {           \
     14     StopBenchmarkTiming();                                           \
     15     CREATE_THREAD_POOL(THREADS);                                     \
     16     BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
     17     suite.FUNC(iters);                                               \
     18   }                                                                  \
     19   BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
     20 
     21 BM_FuncCPU(memcpy, 4);
     22 BM_FuncCPU(memcpy, 8);
     23 BM_FuncCPU(memcpy, 12);
     24 
     25 BM_FuncCPU(typeCasting, 4);
     26 BM_FuncCPU(typeCasting, 8);
     27 BM_FuncCPU(typeCasting, 12);
     28 
     29 BM_FuncCPU(random, 4);
     30 BM_FuncCPU(random, 8);
     31 BM_FuncCPU(random, 12);
     32 
     33 BM_FuncCPU(slicing, 4);
     34 BM_FuncCPU(slicing, 8);
     35 BM_FuncCPU(slicing, 12);
     36 
     37 BM_FuncCPU(rowChip, 4);
     38 BM_FuncCPU(rowChip, 8);
     39 BM_FuncCPU(rowChip, 12);
     40 
     41 BM_FuncCPU(colChip, 4);
     42 BM_FuncCPU(colChip, 8);
     43 BM_FuncCPU(colChip, 12);
     44 
     45 BM_FuncCPU(shuffling, 4);
     46 BM_FuncCPU(shuffling, 8);
     47 BM_FuncCPU(shuffling, 12);
     48 
     49 BM_FuncCPU(padding, 4);
     50 BM_FuncCPU(padding, 8);
     51 BM_FuncCPU(padding, 12);
     52 
     53 BM_FuncCPU(striding, 4);
     54 BM_FuncCPU(striding, 8);
     55 BM_FuncCPU(striding, 12);
     56 
     57 BM_FuncCPU(broadcasting, 4);
     58 BM_FuncCPU(broadcasting, 8);
     59 BM_FuncCPU(broadcasting, 12);
     60 
     61 BM_FuncCPU(coeffWiseOp, 4);
     62 BM_FuncCPU(coeffWiseOp, 8);
     63 BM_FuncCPU(coeffWiseOp, 12);
     64 
     65 BM_FuncCPU(algebraicFunc, 4);
     66 BM_FuncCPU(algebraicFunc, 8);
     67 BM_FuncCPU(algebraicFunc, 12);
     68 
     69 BM_FuncCPU(transcendentalFunc, 4);
     70 BM_FuncCPU(transcendentalFunc, 8);
     71 BM_FuncCPU(transcendentalFunc, 12);
     72 
     73 BM_FuncCPU(rowReduction, 4);
     74 BM_FuncCPU(rowReduction, 8);
     75 BM_FuncCPU(rowReduction, 12);
     76 
     77 BM_FuncCPU(colReduction, 4);
     78 BM_FuncCPU(colReduction, 8);
     79 BM_FuncCPU(colReduction, 12);
     80 
     81 
     82 // Contractions
     83 #define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS)                      \
     84   static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \
     85     StopBenchmarkTiming();                                                      \
     86     if (THREADS == 1) {                                                         \
     87       Eigen::DefaultDevice device;                                              \
     88       BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3);    \
     89       suite.FUNC(iters);                                                        \
     90     } else {                                                                    \
     91       CREATE_THREAD_POOL(THREADS);                                              \
     92       BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
     93       suite.FUNC(iters);                                                        \
     94     }                                                                           \
     95   }                                                                             \
     96   BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
     97 
     98 
     99 BM_FuncWithInputDimsCPU(contraction, N, N, N, 1);
    100 BM_FuncWithInputDimsCPU(contraction, N, N, N, 4);
    101 BM_FuncWithInputDimsCPU(contraction, N, N, N, 8);
    102 BM_FuncWithInputDimsCPU(contraction, N, N, N, 12);
    103 BM_FuncWithInputDimsCPU(contraction, N, N, N, 16);
    104 
    105 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1);
    106 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4);
    107 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8);
    108 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12);
    109 BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16);
    110 
    111 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1);
    112 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4);
    113 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
    114 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
    115 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
    116 
    117 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
    118 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
    119 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
    120 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
    121 BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
    122 
    123 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
    124 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
    125 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
    126 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12);
    127 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16);
    128 
    129 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1);
    130 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4);
    131 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8);
    132 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12);
    133 BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
    134 
    135 
    136 // Convolutions
    137 #define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS)                    \
    138   static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) {   \
    139     StopBenchmarkTiming();                                                     \
    140     CREATE_THREAD_POOL(THREADS);                                               \
    141     BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N);	       \
    142     suite.FUNC(iters, DIM1, DIM2);                                             \
    143   }                                                                            \
    144   BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
    145 
    146 BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4);
    147 BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8);
    148 BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12);
    149 
    150 BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4);
    151 BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8);
    152 BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12);
    153 
    154 BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4);
    155 BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8);
    156 BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12);
    157 
    158 BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4);
    159 BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8);
    160 BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12);
    161 
    162 BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4);
    163 BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8);
    164 BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12);
    165 
    166 BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4);
    167 BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8);
    168 BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12);
    169