// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_USE_GPU

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
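// __CUDACC_VER__ >= 70500 corresponds to CUDA 7.5, the first release that
// shipped cuda_fp16.h (half-precision support).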
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;

void test_cuda_nullary() {
  Tensor<float, 1, 0, int> in1(2);
  Tensor<float, 1, 0, int> in2(2);
  in1.setRandom();
  in2.setRandom();

  std::size_t tensor_bytes = in1.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  cudaMalloc((void**)(&d_in1), tensor_bytes);
  cudaMalloc((void**)(&d_in2), tensor_bytes);
  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
      d_in1, 2);
  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
      d_in2, 2);

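  // Nullary expressions generate their values entirely on the device,
  // overwriting the data that was just copied up from the host.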
  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
  gpu_in2.device(gpu_device) = gpu_in2.random();

  Tensor<float, 1, 0, int> new1(2);
  Tensor<float, 1, 0, int> new2(2);

  assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);

  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(new1(i), 3.14f);
    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
}

void test_cuda_elementwise_small() {
  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
  in1.setRandom();
  in2.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_in2), in2_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
      d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
      d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
      d_out, Eigen::array<Eigen::DenseIndex, 1>(2));

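  // The element-wise sum is evaluated by a single kernel launched on the
  // device's stream.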
  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(
        out(Eigen::array<Eigen::DenseIndex, 1>(i)),
        in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out);
}

void test_cuda_elementwise()
{
  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  in1.setRandom();
  in2.setRandom();
  in3.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t in3_bytes = in3.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_in3;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_in2), in2_bytes);
  cudaMalloc((void**)(&d_in3), in3_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));

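  // Eigen fuses the whole expression (multiply, then add) into one kernel;
  // no temporary is materialized for gpu_in2 * gpu_in3.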
  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 53; ++j) {
      for (int k = 0; k < 97; ++k) {
        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
      }
    }
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_in3);
  cudaFree(d_out);
}

void test_cuda_props() {
  Tensor<float, 1> in1(200);
  Tensor<bool, 1> out(200);
  in1.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(bool);

  float* d_in1;
  bool* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
      d_in1, 200);
  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
      d_out, 200);

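  // isnan is wrapped in parentheses so the call survives on platforms where
  // <math.h> defines isnan as a macro.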
  gpu_out.device(gpu_device) = (gpu_in1.isnan)();

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 200; ++i) {
    VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
  }

  cudaFree(d_in1);
  cudaFree(d_out);
}

void test_cuda_reduction()
{
  Tensor<float, 4> in1(72,53,97,113);
  Tensor<float, 2> out(72,97);
  in1.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);

  array<Eigen::DenseIndex, 2> reduction_axis;
  reduction_axis[0] = 1;
  reduction_axis[1] = 3;

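  // Taking the maximum over dimensions 1 and 3 collapses the 72x53x97x113
  // input to a 72x97 output.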
  gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      float expected = 0;
      for (int k = 0; k < 53; ++k) {
        for (int l = 0; l < 113; ++l) {
          expected =
              std::max<float>(expected, in1(i, k, j, l));
        }
      }
      VERIFY_IS_APPROX(out(i,j), expected);
    }
  }

  cudaFree(d_in1);
  cudaFree(d_out);
}

template<int DataLayout>
void test_cuda_contraction()
{
  // With these dimensions the output has 300 * 140 = 42,000 elements, which
  // exceeds 30 * 1024 = 30,720, the number of threads in blocks on a 15-SM
  // GK110 GPU.
  Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(float);
  std::size_t t_right_bytes = t_right.size() * sizeof(float);
  std::size_t t_result_bytes = t_result.size() * sizeof(float);

  float* d_t_left;
  float* d_t_right;
  float* d_t_result;

  cudaMalloc((void**)(&d_t_left), t_left_bytes);
  cudaMalloc((void**)(&d_t_right), t_right_bytes);
  cudaMalloc((void**)(&d_t_result), t_result_bytes);

  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);

  typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
  MapXf m_left(t_left.data(), 300, 93);
  MapXf m_right(t_right.data(), 93, 140);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);

  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 2> dims;
  dims[0] = DimPair(2, 0);
  dims[1] = DimPair(3, 1);

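  // Contracting dims (2,0) and (3,1) views t_left as a 300x93 matrix
  // (6*50 by 3*31) and t_right as 93x140 (3*31 by 7*20*1), so the device
  // contraction must match the plain matrix product computed on the host.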
  m_result = m_left * m_right;
  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);

  cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);

  for (DenseIndex i = 0; i < t_result.size(); i++) {
    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
      assert(false);
    }
  }

  cudaFree(d_t_left);
  cudaFree(d_t_right);
  cudaFree(d_t_result);
}

template<int DataLayout>
void test_cuda_convolution_1d()
{
  Tensor<float, 4, DataLayout> input(74,37,11,137);
  Tensor<float, 1, DataLayout> kernel(4);
  Tensor<float, 4, DataLayout> out(74,34,11,137);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
  Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);

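  // "Valid" convolution of dimension 1 (size 37) with a size-4 kernel
  // leaves 37 - 4 + 1 = 34 outputs along that dimension.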
  Eigen::array<Eigen::DenseIndex, 1> dims(1);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 34; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 137; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
                                 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

void test_cuda_convolution_inner_dim_col_major_1d()
{
  Tensor<float, 4, ColMajor> input(74,9,11,7);
  Tensor<float, 1, ColMajor> kernel(4);
  Tensor<float, 4, ColMajor> out(71,9,11,7);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
  Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);

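  // Dimension 0 is the innermost (contiguous) dimension in column-major
  // layout; convolving along it gives 74 - 4 + 1 = 71 outputs.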
  Eigen::array<Eigen::DenseIndex, 1> dims(0);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 71; ++i) {
    for (int j = 0; j < 9; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 7; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
                                 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

void test_cuda_convolution_inner_dim_row_major_1d()
{
  Tensor<float, 4, RowMajor> input(7,9,11,74);
  Tensor<float, 1, RowMajor> kernel(4);
  Tensor<float, 4, RowMajor> out(7,9,11,71);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
  Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);

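  // The row-major mirror of the previous test: here dimension 3 is the
  // innermost one, again yielding 74 - 4 + 1 = 71 outputs.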
  Eigen::array<Eigen::DenseIndex, 1> dims(3);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 7; ++i) {
    for (int j = 0; j < 9; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 71; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
                                 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

template<int DataLayout>
void test_cuda_convolution_2d()
{
  Tensor<float, 4, DataLayout> input(74,37,11,137);
  Tensor<float, 2, DataLayout> kernel(3,4);
  Tensor<float, 4, DataLayout> out(74,35,8,137);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);

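  // 2D valid convolution over dims 1 and 2 with a 3x4 kernel:
  // (37 - 3 + 1) x (11 - 4 + 1) = 35 x 8 outputs.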
  Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 35; ++j) {
      for (int k = 0; k < 8; ++k) {
        for (int l = 0; l < 137; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
                                 input(i,j+1,k+0,l) * kernel(1,0) +
                                 input(i,j+2,k+0,l) * kernel(2,0) +
                                 input(i,j+0,k+1,l) * kernel(0,1) +
                                 input(i,j+1,k+1,l) * kernel(1,1) +
                                 input(i,j+2,k+1,l) * kernel(2,1) +
                                 input(i,j+0,k+2,l) * kernel(0,2) +
                                 input(i,j+1,k+2,l) * kernel(1,2) +
                                 input(i,j+2,k+2,l) * kernel(2,2) +
                                 input(i,j+0,k+3,l) * kernel(0,3) +
                                 input(i,j+1,k+3,l) * kernel(1,3) +
                                 input(i,j+2,k+3,l) * kernel(2,3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}

template<int DataLayout>
void test_cuda_convolution_3d()
{
  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
  Tensor<float, 3, DataLayout> kernel(3,4,2);
  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);

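  // 3D valid convolution over dims 1, 2 and 3 with a 3x4x2 kernel:
  // 37 - 3 + 1 = 35, 11 - 4 + 1 = 8 and 137 - 2 + 1 = 136 outputs.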
  Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 35; ++j) {
      for (int k = 0; k < 8; ++k) {
        for (int l = 0; l < 136; ++l) {
          for (int m = 0; m < 17; ++m) {
            const float result = out(i,j,k,l,m);
            const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
                                   input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
                                   input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
                                   input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
                                   input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
                                   input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
                                   input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
                                   input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
                                   input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
                                   input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
                                   input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
                                   input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
                                   input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
                                   input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
                                   input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
                                   input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
                                   input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
                                   input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
                                   input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
                                   input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
                                   input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
                                   input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
                                   input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
                                   input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
            VERIFY_IS_APPROX(result, expected);
          }
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}


template <typename Scalar>
void test_cuda_lgamma(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

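  // lgamma computes log|Gamma(x)| element-wise; std::lgamma on the host
  // serves as the golden reference below.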
  gpu_out.device(gpu_device) = gpu_in.lgamma();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
    }
  }

  cudaFree(d_in);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_digamma()
{
  Tensor<Scalar, 1> in(7);
  Tensor<Scalar, 1> out(7);
  Tensor<Scalar, 1> expected_out(7);
  out.setZero();

  in(0) = Scalar(1);
  in(1) = Scalar(1.5);
  in(2) = Scalar(4);
  in(3) = Scalar(-10.5);
  in(4) = Scalar(10000.5);
  in(5) = Scalar(0);
  in(6) = Scalar(-1);

  expected_out(0) = Scalar(-0.5772156649015329);
  expected_out(1) = Scalar(0.03648997397857645);
  expected_out(2) = Scalar(1.2561176684318);
  expected_out(3) = Scalar(2.398239129535781);
  expected_out(4) = Scalar(9.210340372392849);
  expected_out(5) = std::numeric_limits<Scalar>::infinity();
  expected_out(6) = std::numeric_limits<Scalar>::infinity();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);

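  // digamma is the logarithmic derivative of the gamma function; it has
  // poles at the non-positive integers, hence the expected infinities for
  // the inputs 0 and -1.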
  gpu_out.device(gpu_device) = gpu_in.digamma();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 5; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }
  for (int i = 5; i < 7; ++i) {
    VERIFY_IS_EQUAL(out(i), expected_out(i));
  }

  cudaFree(d_in);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_zeta()
{
  Tensor<Scalar, 1> in_x(6);
  Tensor<Scalar, 1> in_q(6);
  Tensor<Scalar, 1> out(6);
  Tensor<Scalar, 1> expected_out(6);
  out.setZero();

  in_x(0) = Scalar(1);
  in_x(1) = Scalar(1.5);
  in_x(2) = Scalar(4);
  in_x(3) = Scalar(-10.5);
  in_x(4) = Scalar(10000.5);
  in_x(5) = Scalar(3);

  in_q(0) = Scalar(1.2345);
  in_q(1) = Scalar(2);
  in_q(2) = Scalar(1.5);
  in_q(3) = Scalar(3);
  in_q(4) = Scalar(1.0001);
  in_q(5) = Scalar(-2.5);

  expected_out(0) = std::numeric_limits<Scalar>::infinity();
  expected_out(1) = Scalar(1.61237534869);
  expected_out(2) = Scalar(0.234848505667);
  expected_out(3) = Scalar(1.03086757337e-5);
  expected_out(4) = Scalar(0.367879440865);
  expected_out(5) = Scalar(0.054102025820864097);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_q;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in_x), bytes);
  cudaMalloc((void**)(&d_in_q), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);

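  // zeta(x, q) is the Hurwitz zeta function, which diverges at x == 1;
  // hence the expected infinity at index 0.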
  gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  VERIFY_IS_EQUAL(out(0), expected_out(0));
  VERIFY((std::isnan)(out(3)));

  for (int i = 1; i < 6; ++i) {
    if (i != 3) {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }

  cudaFree(d_in_x);
  cudaFree(d_in_q);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_polygamma()
{
  Tensor<Scalar, 1> in_x(7);
  Tensor<Scalar, 1> in_n(7);
  Tensor<Scalar, 1> out(7);
  Tensor<Scalar, 1> expected_out(7);
  out.setZero();

  in_n(0) = Scalar(1);
  in_n(1) = Scalar(1);
  in_n(2) = Scalar(1);
  in_n(3) = Scalar(17);
  in_n(4) = Scalar(31);
  in_n(5) = Scalar(28);
  in_n(6) = Scalar(8);

  in_x(0) = Scalar(2);
  in_x(1) = Scalar(3);
  in_x(2) = Scalar(25.5);
  in_x(3) = Scalar(4.7);
  in_x(4) = Scalar(11.8);
  in_x(5) = Scalar(17.7);
  in_x(6) = Scalar(30.2);

  expected_out(0) = Scalar(0.644934066848);
  expected_out(1) = Scalar(0.394934066848);
  expected_out(2) = Scalar(0.0399946696496);
  expected_out(3) = Scalar(293.334565435);
  expected_out(4) = Scalar(0.445487887616);
  expected_out(5) = Scalar(-2.47810300902e-07);
  expected_out(6) = Scalar(-8.29668781082e-09);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_n;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in_x), bytes);
  cudaMalloc((void**)(&d_in_n), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);

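  // polygamma(n, x) is the n-th derivative of digamma(x); note the calling
  // convention used here: the order tensor n is the object of the call.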
  gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 7; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  cudaFree(d_in_x);
  cudaFree(d_in_n);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_igamma()
{
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      a(i, j) = a_s[i];
      x(i, j) = x_s[j];
    }
  }

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
  Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
                          {0.0, 0.6321205588285578, 0.7768698398515702,
                           0.9816843611112658, 9.999500016666262e-05, 1.0},
                          {0.0, 0.4275932955291202, 0.608374823728911,
                           0.9539882943107686, 7.522076445089201e-07, 1.0},
                          {0.0, 0.01898815687615381, 0.06564245437845008,
                           0.5665298796332909, 4.166333347221828e-18, 1.0},
                          {0.0, 0.9999780593618628, 0.9999899967080838,
                           0.9999996219837988, 0.9991370418689945, 1.0},
                          {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};

  std::size_t bytes = a.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
  assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);

  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);

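  // igamma(a, x) is the regularized lower incomplete gamma function P(a, x);
  // as the table above encodes, a == 0 with x > 0 yields NaN here.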
  gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      if ((std::isnan)(igamma_s[i][j])) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
      }
    }
  }

  cudaFree(d_a);
  cudaFree(d_x);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_igammac()
{
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      a(i, j) = a_s[i];
      x(i, j) = x_s[j];
    }
  }

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
  Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
                           {1.0, 0.36787944117144233, 0.22313016014842982,
                            0.018315638888734182, 0.9999000049998333, 0.0},
                           {1.0, 0.5724067044708798, 0.3916251762710878,
                            0.04601170568923136, 0.9999992477923555, 0.0},
                           {1.0, 0.9810118431238462, 0.9343575456215499,
                            0.4334701203667089, 1.0, 0.0},
                           {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
                            3.7801620118431334e-07, 0.0008629581310054535,
                            0.0},
                           {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};

  std::size_t bytes = a.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  cudaMalloc((void**)(&d_a), bytes);
  cudaMalloc((void**)(&d_x), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);

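  // igammac(a, x) is the complement Q(a, x) = 1 - P(a, x).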
  gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      if ((std::isnan)(igammac_s[i][j])) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
      }
    }
  }

  cudaFree(d_a);
  cudaFree(d_x);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_erf(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

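  // erf is the Gauss error function; std::erf (C++11) provides the host
  // reference.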
  gpu_out.device(gpu_device) = gpu_in.erf();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
    }
  }

  cudaFree(d_in);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_erfc(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

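  // erfc(x) = 1 - erf(x). The test harness only exercises moderate stddev
  // values because CUDA's erfc lacks precision for large inputs.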
  gpu_out.device(gpu_device) = gpu_in.erfc();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
    }
  }

  cudaFree(d_in);
  cudaFree(d_out);
}

template <typename Scalar>
void test_cuda_betainc()
{
  Tensor<Scalar, 1> in_x(125);
  Tensor<Scalar, 1> in_a(125);
  Tensor<Scalar, 1> in_b(125);
  Tensor<Scalar, 1> out(125);
  Tensor<Scalar, 1> expected_out(125);
  out.setZero();

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();

  Array<Scalar, 1, Dynamic> x(125);
  Array<Scalar, 1, Dynamic> a(125);
  Array<Scalar, 1, Dynamic> b(125);
  Array<Scalar, 1, Dynamic> v(125);

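  // The 125 test points form a 5x5x5 grid: five values each of a, b and x,
  // with the expected results v precomputed (NaN marks out-of-domain points).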
  a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;

  b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999;

  x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
      0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
      -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;

  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
      nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
      nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
      0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
      0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
      nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
      0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
      0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
      0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
      nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
      nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
      nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
      nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
      2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
      2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
      0.5000000000004947, 0.9999999999999999, nan;

  for (int i = 0; i < 125; ++i) {
    in_x(i) = x(i);
    in_a(i) = a(i);
    in_b(i) = b(i);
    expected_out(i) = v(i);
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_a;
  Scalar* d_in_b;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in_x), bytes);
  cudaMalloc((void**)(&d_in_a), bytes);
  cudaMalloc((void**)(&d_in_b), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);

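  // betainc(a, b, x) is the regularized incomplete beta function I_x(a, b),
  // defined for a > 0, b > 0 and 0 <= x <= 1.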
  gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 1; i < 125; ++i) {
    if ((std::isnan)(expected_out(i))) {
      VERIFY((std::isnan)(out(i)));
    } else {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }

  cudaFree(d_in_x);
  cudaFree(d_in_a);
  cudaFree(d_in_b);
  cudaFree(d_out);
}


void test_cxx11_tensor_cuda()
{
  CALL_SUBTEST_1(test_cuda_nullary());
  CALL_SUBTEST_1(test_cuda_elementwise_small());
  CALL_SUBTEST_1(test_cuda_elementwise());
  CALL_SUBTEST_1(test_cuda_props());
  CALL_SUBTEST_1(test_cuda_reduction());
  CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
  CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
  CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());

#if __cplusplus > 199711L
  // std::erf, std::erfc, and so on were only added in C++11. We use them
  // as a golden reference to validate the results produced by Eigen, so
  // these tests can only run with a C++11 compiler.
  CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
  CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));

  CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
  CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));

  CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
  CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
  CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
  CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));

  CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
  // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
  CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
  CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
  CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));

  CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
  CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
  CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
  CALL_SUBTEST_4(test_cuda_erf<double>(0.001));

  CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
  // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
  CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
  CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
  CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));

  CALL_SUBTEST_5(test_cuda_digamma<float>());
  CALL_SUBTEST_5(test_cuda_digamma<double>());

  CALL_SUBTEST_5(test_cuda_polygamma<float>());
  CALL_SUBTEST_5(test_cuda_polygamma<double>());

  CALL_SUBTEST_5(test_cuda_zeta<float>());
  CALL_SUBTEST_5(test_cuda_zeta<double>());

  CALL_SUBTEST_5(test_cuda_igamma<float>());
  CALL_SUBTEST_5(test_cuda_igammac<float>());

  CALL_SUBTEST_5(test_cuda_igamma<double>());
  CALL_SUBTEST_5(test_cuda_igammac<double>());

  CALL_SUBTEST_6(test_cuda_betainc<float>());
  CALL_SUBTEST_6(test_cuda_betainc<double>());
#endif
}