// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_device
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;
using Eigen::RowMajor;

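// The templated test expressions below are evaluated twice: once through a
// CPUContext (Eigen::DefaultDevice) and once through a GPUContext
// (Eigen::GpuDevice). Both contexts expose the same interface: device(),
// in1(), in2(), out(), and kernel1d()/kernel2d()/kernel3d().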
// Context for evaluation on cpu
struct CPUContext {
  CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out)
      : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
    kernel_1d_(0) = 3.14f;
    kernel_1d_(1) = 2.7f;

    kernel_2d_(0,0) = 3.14f;
    kernel_2d_(1,0) = 2.7f;
    kernel_2d_(0,1) = 0.2f;
    kernel_2d_(1,1) = 7.0f;

    kernel_3d_(0,0,0) = 3.14f;
    kernel_3d_(0,1,0) = 2.7f;
    kernel_3d_(0,0,1) = 0.2f;
    kernel_3d_(0,1,1) = 7.0f;
    kernel_3d_(1,0,0) = -1.0f;
    kernel_3d_(1,1,0) = -0.3f;
    kernel_3d_(1,0,1) = -0.7f;
    kernel_3d_(1,1,1) = -0.5f;
  }

  const Eigen::DefaultDevice& device() const { return cpu_device_; }

  const Eigen::Tensor<float, 3>& in1() const { return in1_; }
  const Eigen::Tensor<float, 3>& in2() const { return in2_; }
  Eigen::Tensor<float, 3>& out() { return out_; }
  const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
  const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
  const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }

 private:
  const Eigen::Tensor<float, 3>& in1_;
  const Eigen::Tensor<float, 3>& in2_;
  Eigen::Tensor<float, 3>& out_;

  Eigen::Tensor<float, 1> kernel_1d_;
  Eigen::Tensor<float, 2> kernel_2d_;
  Eigen::Tensor<float, 3> kernel_3d_;

  Eigen::DefaultDevice cpu_device_;
};


// Context for evaluation on GPU
struct GPUContext {
  GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out)
      : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
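    // The convolution kernels live in device memory. The value arrays below
    // are laid out in Eigen's default column-major order, so they describe
    // the same kernels as the per-element CPU initialization above.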
    assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess);
    float kernel_1d_val[] = {3.14f, 2.7f};
    assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);

    assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess);
    float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
    assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);

    assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess);
    float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
    assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
  }
  ~GPUContext() {
    assert(cudaFree(kernel_1d_) == cudaSuccess);
    assert(cudaFree(kernel_2d_) == cudaSuccess);
    assert(cudaFree(kernel_3d_) == cudaSuccess);
  }

  const Eigen::GpuDevice& device() const { return gpu_device_; }

  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
  Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
  Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
  Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }

 private:
  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;

  float* kernel_1d_;
  float* kernel_2d_;
  float* kernel_3d_;

  Eigen::CudaStreamDevice stream_;
  Eigen::GpuDevice gpu_device_;
};


// The actual expression to evaluate
template <typename Context>
void test_contextual_eval(Context* context)
{
  context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
}

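// Same expression, but .eval() forces the sum into a temporary that is
// materialized on the device before the rest of the expression is evaluated.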
template <typename Context>
void test_forced_contextual_eval(Context* context)
{
  context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
}

template <typename Context>
void test_compound_assignment(Context* context)
{
  context->out().device(context->device()) = context->in1().constant(2.718f);
  context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
}


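// Contract dimensions 1 and 2 of in1 with dimensions 1 and 2 of in2, giving a
// 40x40 result. It is written into the top-left 40x40 corner of out, viewed
// as a 40x3500 matrix.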
template <typename Context>
void test_contraction(Context* context)
{
  Eigen::array<std::pair<int, int>, 2> dims;
  dims[0] = std::make_pair(1, 1);
  dims[1] = std::make_pair(2, 2);

  Eigen::array<int, 2> shape(40, 50*70);

  Eigen::DSizes<int, 2> indices(0,0);
  Eigen::DSizes<int, 2> sizes(40,40);

  context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
}


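// Each convolution kernel has extent 2 along every convolved dimension, so
// the valid output shrinks by one along those dimensions. The result is
// written into a correspondingly sized slice of out.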
template <typename Context>
void test_1d_convolution(Context* context)
{
  Eigen::DSizes<int, 3> indices(0,0,0);
  Eigen::DSizes<int, 3> sizes(40,49,70);

  Eigen::array<int, 1> dims(1);
  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
}

template <typename Context>
void test_2d_convolution(Context* context)
{
  Eigen::DSizes<int, 3> indices(0,0,0);
  Eigen::DSizes<int, 3> sizes(40,49,69);

  Eigen::array<int, 2> dims(1,2);
  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
}

template <typename Context>
void test_3d_convolution(Context* context)
{
  Eigen::DSizes<int, 3> indices(0,0,0);
  Eigen::DSizes<int, 3> sizes(39,49,69);

  Eigen::array<int, 3> dims(0,1,2);
  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
}


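// Note: the random inputs are offset by +10, presumably to keep the expected
// values well away from zero; the 2d/3d convolution checks in test_cpu() also
// skip results that are too close to zero.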
void test_cpu() {
  Eigen::Tensor<float, 3> in1(40,50,70);
  Eigen::Tensor<float, 3> in2(40,50,70);
  Eigen::Tensor<float, 3> out(40,50,70);

  in1 = in1.random() + in1.constant(10.0f);
  in2 = in2.random() + in2.constant(10.0f);

  CPUContext context(in1, in2, out);
  test_contextual_eval(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
      }
    }
  }

  test_forced_contextual_eval(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
      }
    }
  }

  test_compound_assignment(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
      }
    }
  }

  test_contraction(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 40; ++j) {
      const float result = out(i,j,0);
      float expected = 0;
      for (int k = 0; k < 50; ++k) {
        for (int l = 0; l < 70; ++l) {
          expected += in1(i, k, l) * in2(j, k, l);
        }
      }
      VERIFY_IS_APPROX(expected, result);
    }
  }

  test_1d_convolution(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
      }
    }
  }

  test_2d_convolution(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 69; ++k) {
        const float result = out(i,j,k);
        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
                               (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
          continue;
        }
        VERIFY_IS_APPROX(expected, result);
      }
    }
  }

  test_3d_convolution(&context);
  for (int i = 0; i < 39; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 69; ++k) {
        const float result = out(i,j,k);
        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
                               (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
          continue;
        }
        VERIFY_IS_APPROX(expected, result);
      }
    }
  }
}

void test_gpu() {
  Eigen::Tensor<float, 3> in1(40,50,70);
  Eigen::Tensor<float, 3> in2(40,50,70);
  Eigen::Tensor<float, 3> out(40,50,70);
  in1 = in1.random() + in1.constant(10.0f);
  in2 = in2.random() + in2.constant(10.0f);

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_in2), in2_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);

  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);

  GPUContext context(gpu_in1, gpu_in2, gpu_out);
  test_contextual_eval(&context);
  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
      }
    }
  }

  test_forced_contextual_eval(&context);
  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
      }
    }
  }

  test_compound_assignment(&context);
  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
      }
    }
  }

  test_contraction(&context);
  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 40; ++j) {
      const float result = out(i,j,0);
      float expected = 0;
      for (int k = 0; k < 50; ++k) {
        for (int l = 0; l < 70; ++l) {
          expected += in1(i, k, l) * in2(j, k, l);
        }
      }
      VERIFY_IS_APPROX(expected, result);
    }
  }

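  // The convolution expressions below are launched on the context's CUDA
  // stream, so copy the results back asynchronously on the same stream and
  // synchronize before verifying on the host.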
  test_1d_convolution(&context);
  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
      }
    }
  }

  test_2d_convolution(&context);
  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 69; ++k) {
        const float result = out(i,j,k);
        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
        VERIFY_IS_APPROX(expected, result);
      }
    }
  }

  test_3d_convolution(&context);
  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
  for (int i = 0; i < 39; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 69; ++k) {
        const float result = out(i,j,k);
        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
                                in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
        VERIFY_IS_APPROX(expected, result);
      }
    }
  }
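
  // Free the device buffers allocated at the top of this function.
  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out);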
}


void test_cxx11_tensor_device()
{
  CALL_SUBTEST_1(test_cpu());
  CALL_SUBTEST_2(test_gpu());
}