// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_FUNC cxx11_tensor_complex
#define EIGEN_USE_GPU

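// cuda_fp16.h is only shipped with CUDA 7.5 and later, so the include is
// guarded on the compiler version reported by __CUDACC_VER__.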
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;

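// Fills one complex-valued device tensor with a constant and computes the
// element-wise magnitude of another, then copies both results back to the
// host for verification.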
void test_cuda_nullary() {
  Tensor<std::complex<float>, 1, 0, int> in1(2);
  Tensor<std::complex<float>, 1, 0, int> in2(2);
  in1.setRandom();
  in2.setRandom();

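  // Raw buffer sizes for the device allocations: complex inputs, real output.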
  std::size_t float_bytes = in1.size() * sizeof(float);
  std::size_t complex_bytes = in1.size() * sizeof(std::complex<float>);

  std::complex<float>* d_in1;
  std::complex<float>* d_in2;
  float* d_out2;
  cudaMalloc((void**)(&d_in1), complex_bytes);
  cudaMalloc((void**)(&d_in2), complex_bytes);
  cudaMalloc((void**)(&d_out2), float_bytes);
  cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);

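  // All device-side tensor expressions below are queued on this CUDA stream.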
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
      d_in1, 2);
  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in2(
      d_in2, 2);
  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_out2(
      d_out2, 2);

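  // Evaluate on the GPU: overwrite in1 with a complex constant and take the
  // element-wise magnitude of in2.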
  gpu_in1.device(gpu_device) = gpu_in1.constant(std::complex<float>(3.14f, 2.7f));
  gpu_out2.device(gpu_device) = gpu_in2.abs();

  Tensor<std::complex<float>, 1, 0, int> new1(2);
  Tensor<float, 1, 0, int> new2(2);

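  // Copy the results back asynchronously on the same stream and wait for the
  // stream to drain before verifying.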
  assert(cudaMemcpyAsync(new1.data(), d_in1, complex_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaMemcpyAsync(new2.data(), d_out2, float_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);

  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(new1(i), std::complex<float>(3.14f, 2.7f));
    VERIFY_IS_APPROX(new2(i), std::abs(in2(i)));
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out2);
}


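// Sums a random complex matrix on both the CPU and the GPU and checks that
// the two full reductions agree.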
static void test_cuda_sum_reductions() {

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
  in.setRandom();

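  // Reference result computed on the CPU; a rank-0 tensor holds the scalar sum.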
  Tensor<std::complex<float>, 0> full_redux;
  full_redux = in.sum();

  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);

  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);

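  // Reduce over all dimensions on the GPU; the result lands in the rank-0 output map.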
  out_gpu.device(gpu_device) = in_gpu.sum();

  Tensor<std::complex<float>, 0> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
  gpu_device.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(gpu_in_ptr);
  gpu_device.deallocate(gpu_out_ptr);
}


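// Same structure as the sum test above, but with a full product reduction.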
static void test_cuda_product_reductions() {

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
  in.setRandom();

  Tensor<std::complex<float>, 0> full_redux;
  full_redux = in.prod();

  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);

  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);

  out_gpu.device(gpu_device) = in_gpu.prod();

  Tensor<std::complex<float>, 0> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
  gpu_device.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(gpu_in_ptr);
  gpu_device.deallocate(gpu_out_ptr);
}


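// Test entry point; the name must match the EIGEN_TEST_FUNC define above.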
void test_cxx11_tensor_complex()
{
  CALL_SUBTEST(test_cuda_nullary());
  CALL_SUBTEST(test_cuda_sum_reductions());
  CALL_SUBTEST(test_cuda_product_reductions());
}