Home | History | Annotate | Download | only in test
      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog (a] gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #define EIGEN_TEST_NO_LONGDOUBLE
     11 #define EIGEN_TEST_NO_COMPLEX
     12 #define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
     13 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
     14 #define EIGEN_USE_GPU
     15 
     16 #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
     17 #include <cuda_fp16.h>
     18 #endif
     19 #include "main.h"
     20 #include <unsupported/Eigen/CXX11/Tensor>
     21 
     22 using Eigen::Tensor;
     23 
     24 template<typename>
     25 void test_cuda_numext() {
     26   Eigen::CudaStreamDevice stream;
     27   Eigen::GpuDevice gpu_device(&stream);
     28   int num_elem = 101;
     29 
     30   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
     31   bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
     32   bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
     33 
     34   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
     35       d_float, num_elem);
     36   Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
     37       d_res_half, num_elem);
     38   Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
     39       d_res_float, num_elem);
     40 
     41   gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
     42   gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
     43   gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
     44 
     45   Tensor<bool, 1> half_prec(num_elem);
     46   Tensor<bool, 1> full_prec(num_elem);
     47   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
     48   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
     49   gpu_device.synchronize();
     50 
     51   for (int i = 0; i < num_elem; ++i) {
     52     std::cout << "Checking numext " << i << std::endl;
     53     VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
     54   }
     55 
     56   gpu_device.deallocate(d_float);
     57   gpu_device.deallocate(d_res_half);
     58   gpu_device.deallocate(d_res_float);
     59 }
     60 
     61 
     62 #ifdef EIGEN_HAS_CUDA_FP16
     63 
     64 template<typename>
     65 void test_cuda_conversion() {
     66   Eigen::CudaStreamDevice stream;
     67   Eigen::GpuDevice gpu_device(&stream);
     68   int num_elem = 101;
     69   
     70   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
     71   Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
     72   float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
     73 
     74   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
     75       d_float, num_elem);
     76   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
     77       d_half, num_elem);
     78   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
     79       d_conv, num_elem);
     80 
     81   gpu_float.device(gpu_device) = gpu_float.random();
     82   gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
     83   gpu_conv.device(gpu_device) = gpu_half.cast<float>();
     84 
     85   Tensor<float, 1> initial(num_elem);
     86   Tensor<float, 1> final(num_elem);
     87   gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
     88   gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
     89 
     90   for (int i = 0; i < num_elem; ++i) {
     91     VERIFY_IS_APPROX(initial(i), final(i));
     92   }
     93 
     94   gpu_device.deallocate(d_float);
     95   gpu_device.deallocate(d_half);
     96   gpu_device.deallocate(d_conv);
     97 }
     98 
     99 template<typename>
    100 void test_cuda_unary() {
    101   Eigen::CudaStreamDevice stream;
    102   Eigen::GpuDevice gpu_device(&stream);
    103   int num_elem = 101;
    104 
    105   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
    106   float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
    107   float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
    108 
    109   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
    110       d_float, num_elem);
    111   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
    112       d_res_half, num_elem);
    113   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
    114       d_res_float, num_elem);
    115 
    116   gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
    117   gpu_res_float.device(gpu_device) = gpu_float.abs();
    118   gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
    119 
    120   Tensor<float, 1> half_prec(num_elem);
    121   Tensor<float, 1> full_prec(num_elem);
    122   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
    123   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
    124   gpu_device.synchronize();
    125 
    126   for (int i = 0; i < num_elem; ++i) {
    127     std::cout << "Checking unary " << i << std::endl;
    128     VERIFY_IS_APPROX(full_prec(i), half_prec(i));
    129   }
    130 
    131   gpu_device.deallocate(d_float);
    132   gpu_device.deallocate(d_res_half);
    133   gpu_device.deallocate(d_res_float);
    134 }
    135 
    136 template<typename>
    137 void test_cuda_elementwise() {
    138   Eigen::CudaStreamDevice stream;
    139   Eigen::GpuDevice gpu_device(&stream);
    140   int num_elem = 101;
    141 
    142   float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    143   float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    144   float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
    145   float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
    146 
    147   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
    148       d_float1, num_elem);
    149   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
    150       d_float2, num_elem);
    151   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
    152       d_res_half, num_elem);
    153   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
    154       d_res_float, num_elem);
    155 
    156   gpu_float1.device(gpu_device) = gpu_float1.random();
    157   gpu_float2.device(gpu_device) = gpu_float2.random();
    158   gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
    159   gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
    160 
    161   Tensor<float, 1> half_prec(num_elem);
    162   Tensor<float, 1> full_prec(num_elem);
    163   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
    164   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
    165   gpu_device.synchronize();
    166 
    167   for (int i = 0; i < num_elem; ++i) {
    168     std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
    169     VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
    170   }
    171 
    172   gpu_device.deallocate(d_float1);
    173   gpu_device.deallocate(d_float2);
    174   gpu_device.deallocate(d_res_half);
    175   gpu_device.deallocate(d_res_float);
    176 }
    177 
// Compares the transcendental functions exp, log and log1p evaluated in
// half precision against the float results cast to half.  (NOTE(review):
// "trancendental" is a typo, but the name is referenced by the test driver
// below, so it is kept.)
template<typename>
void test_cuda_trancendental() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  // Three independent inputs, one per function under test.
  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  // For each function: one buffer for the half-precision result and one for
  // the float result cast to half.
  Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);

  // Input 1 (for exp) is centered around zero; input 2 (for log) is shifted
  // to stay positive (NOTE(review): presumably gpu_float2.constant was
  // intended here — harmless since constant() ignores which map it is called
  // on); input 3 (for log1p) is in [0, 1].
  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
  gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
  gpu_float3.device(gpu_device) = gpu_float3.random();
  // Reference results: evaluate in float, then round to half.
  gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
  gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
  gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();

  // Half-precision results: materialize the half input first, then apply the
  // function in place so the whole computation happens in half.
  gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
  gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();

  gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
  gpu_res2_half.device(gpu_device) = gpu_res2_half.log();

  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
  gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();

  Tensor<float, 1> input1(num_elem);
  Tensor<Eigen::half, 1> half_prec1(num_elem);
  Tensor<Eigen::half, 1> full_prec1(num_elem);
  Tensor<float, 1> input2(num_elem);
  Tensor<Eigen::half, 1> half_prec2(num_elem);
  Tensor<Eigen::half, 1> full_prec2(num_elem);
  Tensor<float, 1> input3(num_elem);
  Tensor<Eigen::half, 1> half_prec3(num_elem);
  Tensor<Eigen::half, 1> full_prec3(num_elem);
  gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy near 1: shift both
                                      // sides away from 0 so the relative
                                      // comparison does not blow up
      VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
    else
      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
  }
  gpu_device.deallocate(d_float1);
  gpu_device.deallocate(d_float2);
  gpu_device.deallocate(d_float3);
  gpu_device.deallocate(d_res1_half);
  gpu_device.deallocate(d_res1_float);
  gpu_device.deallocate(d_res2_half);
  gpu_device.deallocate(d_res2_float);
  gpu_device.deallocate(d_res3_float);
  gpu_device.deallocate(d_res3_half);
}
    265 
// Compares a 23x23 matrix product (tensor contraction over one index pair)
// computed entirely in half precision against the float product cast to half.
template<typename>
void test_cuda_contractions() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int rows = 23;
  int cols = 23;
  int num_elem = rows*cols;

  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
      d_float1, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
      d_float2, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
      d_res_half, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
      d_res_float, rows, cols);

  // Entries in [-0.5, 0.5) keep the accumulated dot products small enough
  // to be representable in half precision.
  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
  gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);

  // Contract dim 1 of the first tensor with dim 0 of the second, i.e. a
  // standard matrix product.
  typedef Tensor<float, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
  gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);

  Tensor<Eigen::half, 2> half_prec(rows, cols);
  Tensor<Eigen::half, 2> full_prec(rows, cols);
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
  gpu_device.synchronize();

  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
      // The half-precision accumulation order differs, so only complain when
      // the results diverge by more than the 1e-2 tolerance.
      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
      }
    }
  }

  gpu_device.deallocate(d_float1);
  gpu_device.deallocate(d_float2);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}
    316 
    317 template<typename>
    318 void test_cuda_reductions(int size1, int size2, int redux) {
    319 
    320    std::cout << "Reducing " << size1 << " by " << size2
    321              << " tensor along dim " << redux << std::endl; 
    322 
    323   Eigen::CudaStreamDevice stream;
    324   Eigen::GpuDevice gpu_device(&stream);
    325   int num_elem = size1*size2;
    326   int result_size = (redux == 1 ? size1 : size2);
    327 
    328   float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    329   float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    330   Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
    331   Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
    332 
    333   Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
    334       d_float1, size1, size2);
    335   Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
    336       d_float2, size1, size2);
    337   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
    338       d_res_half, result_size);
    339   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
    340       d_res_float, result_size);
    341 
    342   gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
    343   gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
    344 
    345   Eigen::array<int, 1> redux_dim = {{redux}};
    346   gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
    347   gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
    348 
    349   Tensor<Eigen::half, 1> half_prec(result_size);
    350   Tensor<Eigen::half, 1> full_prec(result_size);
    351   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
    352   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
    353   gpu_device.synchronize();
    354 
    355   for (int i = 0; i < result_size; ++i) {
    356     std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
    357     VERIFY_IS_APPROX(full_prec(i), half_prec(i));
    358   }
    359 
    360   gpu_device.deallocate(d_float1);
    361   gpu_device.deallocate(d_float2);
    362   gpu_device.deallocate(d_res_half);
    363   gpu_device.deallocate(d_res_float);
    364 }
    365 
    366 template<typename>
    367 void test_cuda_reductions() {
    368   test_cuda_reductions<void>(13, 13, 0);
    369   test_cuda_reductions<void>(13, 13, 1);
    370 
    371   test_cuda_reductions<void>(35, 36, 0);
    372   test_cuda_reductions<void>(35, 36, 1);
    373 
    374   test_cuda_reductions<void>(36, 35, 0);
    375   test_cuda_reductions<void>(36, 35, 1);
    376 }
    377 
    378 template<typename>
    379 void test_cuda_full_reductions() {
    380   Eigen::CudaStreamDevice stream;
    381   Eigen::GpuDevice gpu_device(&stream);
    382   int size = 13;
    383   int num_elem = size*size;
    384 
    385   float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    386   float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    387   Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
    388   Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
    389 
    390   Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
    391       d_float1, size, size);
    392   Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
    393       d_float2, size, size);
    394   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
    395       d_res_half);
    396   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
    397       d_res_float);
    398 
    399   gpu_float1.device(gpu_device) = gpu_float1.random();
    400   gpu_float2.device(gpu_device) = gpu_float2.random();
    401 
    402   gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
    403   gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
    404 
    405   Tensor<Eigen::half, 0> half_prec;
    406   Tensor<Eigen::half, 0> full_prec;
    407   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
    408   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
    409   gpu_device.synchronize();
    410 
    411   VERIFY_IS_APPROX(full_prec(), half_prec());
    412 
    413   gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
    414   gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
    415   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
    416   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
    417   gpu_device.synchronize();
    418 
    419   VERIFY_IS_APPROX(full_prec(), half_prec());
    420 
    421   gpu_device.deallocate(d_float1);
    422   gpu_device.deallocate(d_float2);
    423   gpu_device.deallocate(d_res_half);
    424   gpu_device.deallocate(d_res_float);
    425 }
    426 
    427 template<typename>
    428 void test_cuda_forced_evals() {
    429 
    430   Eigen::CudaStreamDevice stream;
    431   Eigen::GpuDevice gpu_device(&stream);
    432   int num_elem = 101;
    433 
    434   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
    435   float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    436   float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
    437   float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
    438 
    439   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
    440       d_float, num_elem);
    441   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
    442       d_res_half1, num_elem);
    443  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
    444       d_res_half2, num_elem);
    445   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
    446       d_res_float, num_elem);
    447 
    448   Eigen::array<int, 1> no_bcast;
    449   no_bcast[0] = 1;
    450 
    451   gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
    452   gpu_res_float.device(gpu_device) = gpu_float.abs();
    453   gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
    454   gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
    455 
    456   Tensor<float, 1> half_prec1(num_elem);
    457   Tensor<float, 1> half_prec2(num_elem);
    458   Tensor<float, 1> full_prec(num_elem);
    459   gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
    460   gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float));
    461   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
    462   gpu_device.synchronize();
    463 
    464   for (int i = 0; i < num_elem; ++i) {
    465     std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
    466     VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
    467     VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
    468   }
    469 
    470   gpu_device.deallocate(d_float);
    471   gpu_device.deallocate(d_res_half1);
    472   gpu_device.deallocate(d_res_half2);
    473   gpu_device.deallocate(d_res_float);
    474 }
    475 #endif
    476 
    477 
// Test driver.  test_cuda_numext needs only float support, so it always
// runs; the remaining subtests require CUDA fp16 support and are skipped
// (with a message) when EIGEN_HAS_CUDA_FP16 is not defined.
void test_cxx11_tensor_of_float16_cuda()
{
  CALL_SUBTEST_1(test_cuda_numext<void>());

#ifdef EIGEN_HAS_CUDA_FP16
  CALL_SUBTEST_1(test_cuda_conversion<void>());
  CALL_SUBTEST_1(test_cuda_unary<void>());
  CALL_SUBTEST_1(test_cuda_elementwise<void>());
  CALL_SUBTEST_1(test_cuda_trancendental<void>());
  CALL_SUBTEST_2(test_cuda_contractions<void>());
  CALL_SUBTEST_3(test_cuda_reductions<void>());
  CALL_SUBTEST_4(test_cuda_full_reductions<void>());
  CALL_SUBTEST_5(test_cuda_forced_evals<void>());
#else
  std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
#endif
}
    495