// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
#define EIGEN_USE_GPU

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

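// Full reduction: sum a randomly sized 2D tensor down to a scalar on both the
// host and the device, and check that the two results agree.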
template<typename Type, int DataLayout>
static void test_full_reductions() {

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
  in.setRandom();

  Tensor<Type, 0, DataLayout> full_redux;
  full_redux = in.sum();

  std::size_t in_bytes = in.size() * sizeof(Type);
  std::size_t out_bytes = full_redux.size() * sizeof(Type);
  Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
  Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);

  TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
  TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);

  out_gpu.device(gpu_device) = in_gpu.sum();

  Tensor<Type, 0, DataLayout> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
  gpu_device.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(gpu_in_ptr);
  gpu_device.deallocate(gpu_out_ptr);
}

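// Reduction over the first dimension of a small 3D tensor. The device
// expression is evaluated twice (= followed by +=), so the GPU result is
// compared against twice the host sum.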
template<typename Type, int DataLayout>
static void test_first_dim_reductions() {
  int dim_x = 33;
  int dim_y = 1;
  int dim_z = 128;

  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
  in.setRandom();

  // Compute the reference reduction on the host.
  Eigen::array<int, 1> red_axis;
  red_axis[0] = 0;
  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);

  // Create the device.
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice dev(&stream);

  // Allocate the device buffers and map them as tensors.
  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
  Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);

  // Run the reduction twice on the device (= then +=), so the result is twice the host sum.
  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
  gpu_out.device(dev) = gpu_in.sum(red_axis);
  gpu_out.device(dev) += gpu_in.sum(red_axis);
  Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
  dev.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  for (int i = 0; i < gpu_out.size(); ++i) {
    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
  }

  dev.deallocate(in_data);
  dev.deallocate(out_data);
}

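// Same as above, but reducing over the last dimension of the 3D tensor.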
template<typename Type, int DataLayout>
static void test_last_dim_reductions() {
  int dim_x = 128;
  int dim_y = 1;
  int dim_z = 33;

  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
  in.setRandom();

  // Compute the reference reduction on the host.
  Eigen::array<int, 1> red_axis;
  red_axis[0] = 2;
  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);

  // Create the device.
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice dev(&stream);

  // Allocate the device buffers and map them as tensors.
  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
  Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);

  // Run the reduction twice on the device (= then +=), so the result is twice the host sum.
  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
  gpu_out.device(dev) = gpu_in.sum(red_axis);
  gpu_out.device(dev) += gpu_in.sum(red_axis);
  Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
  dev.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  for (int i = 0; i < gpu_out.size(); ++i) {
    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
  }

  dev.deallocate(in_data);
  dev.deallocate(out_data);
}

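// Test entry point: runs each reduction test for float and double in both
// ColMajor and RowMajor layouts, skipping the double outer reductions that
// aren't supported yet.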
void test_cxx11_tensor_reduction_cuda() {
  CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
  CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
  CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
  CALL_SUBTEST_2((test_full_reductions<double, RowMajor>()));

  CALL_SUBTEST_3((test_first_dim_reductions<float, ColMajor>()));
  CALL_SUBTEST_3((test_first_dim_reductions<double, ColMajor>()));
  CALL_SUBTEST_4((test_first_dim_reductions<float, RowMajor>()));
  // Outer reductions of doubles aren't supported just yet.
  // CALL_SUBTEST_4((test_first_dim_reductions<double, RowMajor>()));

  CALL_SUBTEST_5((test_last_dim_reductions<float, ColMajor>()));
  // Outer reductions of doubles aren't supported just yet.
  // CALL_SUBTEST_5((test_last_dim_reductions<double, ColMajor>()));
  CALL_SUBTEST_6((test_last_dim_reductions<float, RowMajor>()));
  CALL_SUBTEST_6((test_last_dim_reductions<double, RowMajor>()));
}