// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.


#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_USE_GPU

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;

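// Runs argmax/argmin over an entire 3-d tensor on the GPU and checks that the
// returned linear index matches the planted extrema: the minimum is placed at
// the first element and the maximum at the last, so the expected indices are
// 0 and size-1 in both RowMajor and ColMajor layouts.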
template <int Layout>
void test_cuda_simple_argmax()
{
  Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
  Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
  Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
  in.setRandom();
  in *= in.constant(100.0);
  in(0, 0, 0) = -1000.0;
  in(71, 52, 96) = 1000.0;

  std::size_t in_bytes = in.size() * sizeof(double);
  std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);

  double* d_in;
  DenseIndex* d_out_max;
  DenseIndex* d_out_min;
  cudaMalloc((void**)(&d_in), in_bytes);
  cudaMalloc((void**)(&d_out_max), out_bytes);
  cudaMalloc((void**)(&d_out_min), out_bytes);

  cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));

  gpu_out_max.device(gpu_device) = gpu_in.argmax();
  gpu_out_min.device(gpu_device) = gpu_in.argmin();

  assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
  VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);

  cudaFree(d_in);
  cudaFree(d_out_max);
  cudaFree(d_out_min);
}

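// Exercises argmax along each dimension of a 4-d tensor. For every dimension,
// a known maximum is first planted at index 0 of the reduced dimension (so the
// device result should be 0 everywhere), then at the last index (so the result
// should be dim_size - 1 everywhere).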
template <int DataLayout>
void test_cuda_argmax_dim()
{
  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
  std::vector<int> dims;
  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);

  for (int dim = 0; dim < 4; ++dim) {
    tensor.setRandom();
    tensor = (tensor + tensor.constant(0.5)).log();

    array<DenseIndex, 3> out_shape;
    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];

    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);

    array<DenseIndex, 4> ix;
    for (int i = 0; i < 2; ++i) {
      for (int j = 0; j < 3; ++j) {
        for (int k = 0; k < 5; ++k) {
          for (int l = 0; l < 7; ++l) {
            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
            if (ix[dim] != 0) continue;
            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
            tensor(ix) = 10.0;
          }
        }
      }
    }

    std::size_t in_bytes = tensor.size() * sizeof(float);
    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);

    float* d_in;
    DenseIndex* d_out;
    cudaMalloc((void**)(&d_in), in_bytes);
    cudaMalloc((void**)(&d_out), out_bytes);

    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);

    Eigen::CudaStreamDevice stream;
    Eigen::GpuDevice gpu_device(&stream);

    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);

    gpu_out.device(gpu_device) = gpu_in.argmax(dim);

    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

    VERIFY_IS_EQUAL(tensor_arg.size(),
                    size_t(2*3*5*7 / tensor.dimension(dim)));

    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect max to be in the first index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
    }

    for (int i = 0; i < 2; ++i) {
      for (int j = 0; j < 3; ++j) {
        for (int k = 0; k < 5; ++k) {
          for (int l = 0; l < 7; ++l) {
            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
            if (ix[dim] != tensor.dimension(dim) - 1) continue;
            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
            tensor(ix) = 20.0;
          }
        }
      }
    }

    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);

    gpu_out.device(gpu_device) = gpu_in.argmax(dim);

    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect max to be in the last index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
    }

    cudaFree(d_in);
    cudaFree(d_out);
  }
}

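// Same scheme as test_cuda_argmax_dim, but plants minima (-10.0, then -20.0)
// and verifies argmin along each dimension.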
template <int DataLayout>
void test_cuda_argmin_dim()
{
  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
  std::vector<int> dims;
  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);

  for (int dim = 0; dim < 4; ++dim) {
    tensor.setRandom();
    tensor = (tensor + tensor.constant(0.5)).log();

    array<DenseIndex, 3> out_shape;
    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];

    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);

    array<DenseIndex, 4> ix;
    for (int i = 0; i < 2; ++i) {
      for (int j = 0; j < 3; ++j) {
        for (int k = 0; k < 5; ++k) {
          for (int l = 0; l < 7; ++l) {
            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
            if (ix[dim] != 0) continue;
            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
            tensor(ix) = -10.0;
          }
        }
      }
    }

    std::size_t in_bytes = tensor.size() * sizeof(float);
    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);

    float* d_in;
    DenseIndex* d_out;
    cudaMalloc((void**)(&d_in), in_bytes);
    cudaMalloc((void**)(&d_out), out_bytes);

    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);

    Eigen::CudaStreamDevice stream;
    Eigen::GpuDevice gpu_device(&stream);

    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);

    gpu_out.device(gpu_device) = gpu_in.argmin(dim);

    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

    VERIFY_IS_EQUAL(tensor_arg.size(),
                    size_t(2*3*5*7 / tensor.dimension(dim)));

    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect min to be in the first index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
    }

    for (int i = 0; i < 2; ++i) {
      for (int j = 0; j < 3; ++j) {
        for (int k = 0; k < 5; ++k) {
          for (int l = 0; l < 7; ++l) {
            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
            if (ix[dim] != tensor.dimension(dim) - 1) continue;
            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
            tensor(ix) = -20.0;
          }
        }
      }
    }

    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);

    gpu_out.device(gpu_device) = gpu_in.argmin(dim);

    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect min to be in the last index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
    }

    cudaFree(d_in);
    cudaFree(d_out);
  }
}

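// Test driver: each subtest runs in both RowMajor and ColMajor layouts.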
void test_cxx11_tensor_cuda()
{
  CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>());
  CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>());
  CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>());
  CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>());
  CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>());
  CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>());
}