Home | History | Annotate | Download | only in tensors
      1 #ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
      2 #define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
      3 
      4 typedef int TensorIndex;
      5 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
      6 
      7 #include "unsupported/Eigen/CXX11/Tensor"
      8 #include "benchmark.h"
      9 
     10 #define BENCHMARK_RANGE(bench, lo, hi) \
     11   BENCHMARK(bench)->Range(lo, hi)
     12 
     13 using Eigen::Tensor;
     14 using Eigen::TensorMap;
     15 
// TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats.

// Micro-benchmark suite for the Eigen Tensor module, templatized on the
// execution Device (CPU, thread pool, or GPU — anything providing
// allocate/deallocate/memcpy/memset and usable with .device()) and on the
// scalar type T.  Three device buffers are maintained for the lifetime of
// the suite:
//   a_: m x k elements,  b_: k x n elements,  c_: m x n elements  (type T).
// Each public method evaluates one tensor expression num_iters times
// between StartBenchmarkTiming() and finalizeBenchmark(), then reports the
// number of items (values or FLOP) processed so the benchmark framework
// can derive a throughput figure.
template <typename Device, typename T> class BenchmarkSuite {
 public:
  // Rectangular setup: buffer shapes derived from m, k and n as above.
  BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
      : m_(m), k_(k), n_(n), device_(device) {
    initialize();
  }

  // Square setup: all three dimensions equal to m.
  BenchmarkSuite(const Device& device, size_t m)
      : m_(m), k_(m), n_(m), device_(device) {
    initialize();
  }

  // Releases the three device buffers allocated by initialize().
  // Note device_ is stored by value, so it must remain valid to
  // deallocate here.
  ~BenchmarkSuite() {
    device_.deallocate(a_);
    device_.deallocate(b_);
    device_.deallocate(c_);
  }

  // Raw device-level copy of m*m elements from a_ to c_ (baseline for
  // the bandwidth of the expression-based benchmarks below).
  void memcpy(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
    }
    // Record the number of values copied per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  // Element-wise int -> T cast: reinterprets the a_ buffer as a 2-D int
  // tensor and casts it into a T tensor mapped over b_.
  void typeCasting(int num_iters) {
    eigen_assert(m_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    if (sizeof(T) >= sizeof(int)) {
      sizes[0] = m_;
      sizes[1] = k_;
    } else {
      // T is narrower than int: shrink the logical extents so the int
      // view spans exactly the same number of bytes as m_*k_ elements of
      // T, keeping the reinterpreted read below within the a_ buffer.
      sizes[0] = m_ * sizeof(T) / sizeof(int);
      sizes[1] = k_ * sizeof(T) / sizeof(int);
    }
    const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
    TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.template cast<T>();
    }
    // Record the number of values copied per second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  // Fills an m x m tensor with random values each iteration.
  void random(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = C.random();
    }
    // Record the number of random numbers generated per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  // Copies the four (m/2 x m/2) quadrants of C from slices of A and B,
  // exercising strided (non-contiguous) reads and writes.
  void slicing(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    // Quadrant offsets within the m x m tensors.
    const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
    const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
    const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
    const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
    const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.slice(first_quadrant, quarter_sizes).device(device_) =
          A.slice(first_quadrant, quarter_sizes);
      C.slice(second_quadrant, quarter_sizes).device(device_) =
          B.slice(second_quadrant, quarter_sizes);
      C.slice(third_quadrant, quarter_sizes).device(device_) =
          A.slice(third_quadrant, quarter_sizes);
      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
          B.slice(fourth_quadrant, quarter_sizes);
    }
    // Record the number of values copied from the rhs slice to the lhs slice
    // each second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  // Extracts one row (chip along dimension 0) of the k_ x n_ tensor B
  // into the length-n_ vector C; the row index varies with the iteration.
  void rowChip(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % k_, 0);
    }
    // Record the number of values copied from the rhs chip to the lhs.
    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
  }

  // Same as rowChip but chips along dimension 1 (a column of B).
  void colChip(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % n_, 1);
    }
    // Record the number of values copied from the rhs chip to the lhs.
    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
  }

  // Transposes the m_ x k_ tensor A into the k_ x m_ tensor B by
  // shuffling the two dimensions.
  void shuffling(int num_iters) {
    eigen_assert(m_ == n_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = k_;
    size_b[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

    // {1, 0} swaps the two dimensions, i.e. a 2-D transpose.
    Eigen::array<int, 2> shuffle;
    shuffle[0] = 1;
    shuffle[1] = 0;

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.shuffle(shuffle);
    }
    // Record the number of values shuffled from A and copied to B each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

 // Pads the m_ x (k_-3) tensor A with 2 leading and 1 trailing value
 // along dimension 1, producing the k_ x m_ tensor B (shapes match since
 // m_ == k_ is asserted).
 void padding(int num_iters) {
    eigen_assert(m_ == k_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_-3;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = k_;
    size_b[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

#if defined(EIGEN_HAS_INDEX_LIST)
    // Compile-time pad amounts: (0,0) on dim 0, (2,1) on dim 1.
    Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
                         Eigen::type2indexpair<2, 1> > paddings;
#else
    Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
    paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
    paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.pad(paddings);
    }
    // Record the number of values copied from the padded tensor A each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

 // Reads the m_ x k_ tensor A with a stride of 2 along dimension 1,
 // writing every other column into the m_ x (k_/2) tensor B.
 void striding(int num_iters) {
    eigen_assert(m_ == k_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = m_;
    size_b[1] = k_/2;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 2> strides;
    strides[0] = 1;
    strides[1] = 2;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.stride(strides);
    }
    // Record the number of values copied from the padded tensor A each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  // Broadcasts the m_ x 1 column A across n_ columns into the m_ x n_
  // tensor C.
  void broadcasting(int num_iters) {
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = 1;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_c;
    size_c[0] = m_;
    size_c[1] = n_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<int, 2> broadcast;
    broadcast[0] = 1;
    broadcast[1] = n_;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
    broadcast.set(1, n_);
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.broadcast(broadcast);
    }
    // Record the number of values broadcasted from A and copied to C each second
    finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
  }

  // Fused coefficient-wise expression: C = 3.14 * A + 2.7 * B.
  void coeffWiseOp(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
    }
    // Record the number of FLOP executed per second (2 multiplications and
    // 1 addition per value)
    finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
  }

  // Coefficient-wise algebraic functions: C = rsqrt(A) + sqrt(B) * B^2.
  void algebraicFunc(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
    }
    // Record the number of FLOP executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  // Coefficient-wise transcendental functions: C = exp(A) + log(B).
  void transcendentalFunc(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.exp() + B.log();
    }
    // Record the number of FLOP executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

 // Row reduction: sums the k_ x n_ tensor B along dimension 0 into the
 // length-n_ vector C.
  void rowReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 1> sum_along_dim;
    sum_along_dim[0] = 0;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
    }
    // Record the number of FLOP executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // Column reduction: sums the k_ x n_ tensor B along dimension 1 into
  // the length-k_ vector C.
  void colReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
        b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = k_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(
        c_, output_size);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 1> sum_along_dim;
    sum_along_dim[0] = 1;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
    }
    // Record the number of FLOP executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // Full reduction: sums all k_ * n_ values of B into the rank-0
  // (scalar) tensor C.
  void fullReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
        b_, input_size);
    Eigen::array<TensorIndex, 0> output_size;
    TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
        c_, output_size);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum();
    }
    // Record the number of FLOP executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // do a contraction which is equivalent to a matrix multiplication
  // C (m_ x n_) = A (m_ x k_) * B (k_ x n_).
  void contraction(int num_iters) {
    Eigen::array<TensorIndex, 2> sizeA;
    sizeA[0] = m_;
    sizeA[1] = k_;
    Eigen::array<TensorIndex, 2> sizeB;
    sizeB[0] = k_;
    sizeB[1] = n_;
    Eigen::array<TensorIndex, 2> sizeC;
    sizeC[0] = m_;
    sizeC[1] = n_;

    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);

    // Contract A's dimension 1 with B's dimension 0 (standard matmul).
    typedef typename Tensor<T, 2>::DimensionPair DimPair;
    Eigen::array<DimPair, 1> dims;
    dims[0] = DimPair(1, 0);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.contract(B, dims);
    }
    // Record the number of FLOP executed per second (size_ multiplications and
    // additions for each value in the resulting tensor)
    finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
  }

  // 2-D "valid" convolution of the m_ x n_ input A with a
  // kernel_x x kernel_y kernel B, written to C whose extents shrink by
  // (kernel - 1) in each dimension.
  void convolution(int num_iters, int kernel_x, int kernel_y) {
    Eigen::array<TensorIndex, 2> input_sizes;
    input_sizes[0] = m_;
    input_sizes[1] = n_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
    Eigen::array<TensorIndex, 2> kernel_sizes;
    kernel_sizes[0] = kernel_x;
    kernel_sizes[1] = kernel_y;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
    Eigen::array<TensorIndex, 2> result_sizes;
    result_sizes[0] = m_ - kernel_x + 1;
    result_sizes[1] = n_ - kernel_y + 1;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
    // Convolve over both dimensions of the input.
    Eigen::array<TensorIndex, 2> dims;
    dims[0] = 0;
    dims[1] = 1;

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.convolve(B, dims);
    }
    // Record the number of FLOP executed per second (kernel_size
    // multiplications and additions for each value in the resulting tensor)
    finalizeBenchmark(static_cast<int64_t>(2) *
        (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
  }

 private:
  // Allocates the three device buffers and fills them with arbitrary
  // non-zero bytes.
  void initialize() {
    a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
    b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
    c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));

    // Initialize the content of the memory pools to prevent asan from
    // complaining.
    device_.memset(a_, 12, m_ * k_ * sizeof(T));
    device_.memset(b_, 23, k_ * n_ * sizeof(T));
    device_.memset(c_, 31, m_ * n_ * sizeof(T));

    //BenchmarkUseRealTime();
  }

  // Stops the timer and reports num_items as the work done.  On a GPU
  // device the stream is synchronized first so that asynchronously
  // launched kernels are fully included in the measured time.
  inline void finalizeBenchmark(int64_t num_items) {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
    if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
      device_.synchronize();
    }
#endif
    StopBenchmarkTiming();
    SetBenchmarkFlopsProcessed(num_items);
  }


  TensorIndex m_;  // first dimension
  TensorIndex k_;  // inner/contraction dimension
  TensorIndex n_;  // second dimension
  T* a_;  // m_ x k_ device buffer
  T* b_;  // k_ x n_ device buffer
  T* c_;  // m_ x n_ device buffer
  Device device_;  // held by value; used for allocation and expression evaluation
};
    478 #endif  // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
    479