// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;

// Verify that the isnan functor returns the same answer for float inputs and
// for the same inputs cast to Eigen::half.
template<typename>
void test_cuda_numext() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
  bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
  bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());

  Tensor<bool, 1> half_prec(num_elem);
  Tensor<bool, 1> full_prec(num_elem);
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking numext " << i << std::endl;
    VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
  }

  gpu_device.deallocate(d_float);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}


#ifdef EIGEN_HAS_CUDA_FP16

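// Verify the float -> half -> float round trip on the GPU. For reference, a
// minimal sketch of the conversion being exercised (hypothetical values, not
// part of the test):
//   Eigen::half h(0.1f);                 // float -> half rounds to nearest
//   float back = static_cast<float>(h);  // half -> float is exact
//   // |back - 0.1f| is bounded by half's rounding error (~2^-11 relative)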
template<typename>
void test_cuda_conversion() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
      d_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
      d_conv, num_elem);

  gpu_float.device(gpu_device) = gpu_float.random();
  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
  gpu_conv.device(gpu_device) = gpu_half.cast<float>();

  Tensor<float, 1> initial(num_elem);
  Tensor<float, 1> final(num_elem);
  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
  // The copies above are asynchronous: wait for them to finish before reading
  // the host buffers.
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    VERIFY_IS_APPROX(initial(i), final(i));
  }

  gpu_device.deallocate(d_float);
  gpu_device.deallocate(d_half);
  gpu_device.deallocate(d_conv);
}

// Verify that a unary expression (abs) evaluated in half precision matches
// the float result.
template<typename>
void test_cuda_unary() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(gpu_device) = gpu_float.abs();
  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();

  Tensor<float, 1> half_prec(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking unary " << i << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
  }

  gpu_device.deallocate(d_float);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}

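// Verify that the element-wise expression (a + b) * a evaluated in half
// precision matches the float result to within half precision.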
template<typename>
void test_cuda_elementwise() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
      d_float1, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
      d_float2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  gpu_float1.device(gpu_device) = gpu_float1.random();
  gpu_float2.device(gpu_device) = gpu_float2.random();
  gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
  gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();

  Tensor<float, 1> half_prec(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
    // Compare at half precision: round the float result to half before checking.
    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
  }

  gpu_device.deallocate(d_float1);
  gpu_device.deallocate(d_float2);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}

// Verify exp, log and log1p evaluated in half precision against the float
// results rounded to half.
template<typename>
void test_cuda_transcendental() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);

  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
  // Keep the log inputs strictly positive.
  gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float2.constant(0.5f);
  gpu_float3.device(gpu_device) = gpu_float3.random();
  gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
  gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
  gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();

  // Compute the half-precision results in two steps so that exp/log/log1p
  // actually operate on half values.
  gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
  gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();

  gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
  gpu_res2_half.device(gpu_device) = gpu_res2_half.log();

  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
  gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();

  Tensor<float, 1> input1(num_elem);
  Tensor<Eigen::half, 1> half_prec1(num_elem);
  Tensor<Eigen::half, 1> full_prec1(num_elem);
  Tensor<float, 1> input2(num_elem);
  Tensor<Eigen::half, 1> half_prec2(num_elem);
  Tensor<Eigen::half, 1> full_prec2(num_elem);
  Tensor<float, 1> input3(num_elem);
  Tensor<Eigen::half, 1> half_prec3(num_elem);
  Tensor<Eigen::half, 1> full_prec3(num_elem);
  gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
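// Verify a 23x23 matrix product: the contraction computed entirely in half
// precision is compared against the float contraction rounded to half.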
  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
    // log is inaccurate near 1, and its result is close to 0 there, which
    // inflates the relative error; shift both values away from 0 first.
    if (std::abs(input2(i) - 1.f) < 0.05f)
      VERIFY_IS_APPROX(full_prec2(i) + Eigen::half(0.1f), half_prec2(i) + Eigen::half(0.1f));
    else
      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise log1p " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
  }
  gpu_device.deallocate(d_float1);
  gpu_device.deallocate(d_float2);
  gpu_device.deallocate(d_float3);
  gpu_device.deallocate(d_res1_half);
  gpu_device.deallocate(d_res1_float);
  gpu_device.deallocate(d_res2_half);
  gpu_device.deallocate(d_res2_float);
  gpu_device.deallocate(d_res3_float);
  gpu_device.deallocate(d_res3_half);
}

template<typename>
void test_cuda_contractions() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int rows = 23;
  int cols = 23;
  int num_elem = rows*cols;

  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
      d_float1, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
      d_float2, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
      d_res_half, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
      d_res_float, rows, cols);

  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
  gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);

  typedef Tensor<float, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
  gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);

  Tensor<Eigen::half, 2> half_prec(rows, cols);
  Tensor<Eigen::half, 2> full_prec(rows, cols);
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
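// Verify sums of a size1 x size2 tensor along dimension redux, comparing a
// half-precision reduction against the float reduction cast to half.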
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
  gpu_device.synchronize();

  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      std::cout << "Checking contract " << i << " " << j << ": " << full_prec(i, j) << " vs " << half_prec(i, j) << std::endl;
      // Accept small absolute differences outright; only require approximate
      // equality when the accumulated error exceeds 1e-2.
      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
      }
    }
  }

  gpu_device.deallocate(d_float1);
  gpu_device.deallocate(d_float2);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}

template<typename>
void test_cuda_reductions(int size1, int size2, int redux) {
  std::cout << "Reducing " << size1 << " by " << size2
            << " tensor along dim " << redux << std::endl;

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = size1*size2;
  int result_size = (redux == 1 ? size1 : size2);

  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
      d_float1, size1, size2);
  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
      d_float2, size1, size2);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, result_size);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, result_size);

  gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
  gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;

  Eigen::array<int, 1> redux_dim = {{redux}};
  gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);

  Tensor<Eigen::half, 1> half_prec(result_size);
  Tensor<Eigen::half, 1> full_prec(result_size);
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
  gpu_device.synchronize();

  for (int i = 0; i < result_size; ++i) {
    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
  }

  gpu_device.deallocate(d_float1);
  gpu_device.deallocate(d_float2);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}

template<typename>
void test_cuda_reductions() {
  test_cuda_reductions<void>(13, 13, 0);
  test_cuda_reductions<void>(13, 13, 1);

  test_cuda_reductions<void>(35, 36, 0);
  test_cuda_reductions<void>(35, 36, 1);

  test_cuda_reductions<void>(36, 35, 0);
  test_cuda_reductions<void>(36, 35, 1);
}

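// Verify full (rank-0) reductions, sum and maximum, of a 13x13 tensor in half
// precision against the float results cast to half.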
template<typename>
void test_cuda_full_reductions() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int size = 13;
  int num_elem = size*size;

  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
      d_float1, size, size);
  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
      d_float2, size, size);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
      d_res_half);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
      d_res_float);

  gpu_float1.device(gpu_device) = gpu_float1.random();
  gpu_float2.device(gpu_device) = gpu_float2.random();

  gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();

  Tensor<Eigen::half, 0> half_prec;
  Tensor<Eigen::half, 0> full_prec;
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
  gpu_device.synchronize();

  VERIFY_IS_APPROX(full_prec(), half_prec());

  gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
  gpu_device.synchronize();

  VERIFY_IS_APPROX(full_prec(), half_prec());

  gpu_device.deallocate(d_float1);
  gpu_device.deallocate(d_float2);
  gpu_device.deallocate(d_res_half);
  gpu_device.deallocate(d_res_float);
}

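// Verify that forcing an intermediate evaluation with .eval(), with and
// without a no-op broadcast, matches the direct float computation.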
template<typename>
void test_cuda_forced_evals() {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
      d_res_half1, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
      d_res_half2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  // A broadcast factor of 1, i.e. a no-op broadcast.
  Eigen::array<int, 1> no_bcast;
  no_bcast[0] = 1;

  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(gpu_device) = gpu_float.abs();
  gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
  gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();

  Tensor<float, 1> half_prec1(num_elem);
  Tensor<float, 1> half_prec2(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking forced eval " << i << ": " << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
  }

  gpu_device.deallocate(d_float);
  gpu_device.deallocate(d_res_half1);
  gpu_device.deallocate(d_res_half2);
  gpu_device.deallocate(d_res_float);
}
#endif


void test_cxx11_tensor_of_float16_cuda()
{
  CALL_SUBTEST_1(test_cuda_numext<void>());

#ifdef EIGEN_HAS_CUDA_FP16
  CALL_SUBTEST_1(test_cuda_conversion<void>());
  CALL_SUBTEST_1(test_cuda_unary<void>());
  CALL_SUBTEST_1(test_cuda_elementwise<void>());
  CALL_SUBTEST_1(test_cuda_transcendental<void>());
  CALL_SUBTEST_2(test_cuda_contractions<void>());
  CALL_SUBTEST_3(test_cuda_reductions<void>());
  CALL_SUBTEST_4(test_cuda_full_reductions<void>());
  CALL_SUBTEST_5(test_cuda_forced_evals<void>());
#else
  std::cout << "Half floats are not supported by this version of CUDA: skipping the test" << std::endl;
#endif
}