1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog (a] gmail.com> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #define EIGEN_TEST_NO_LONGDOUBLE 11 #define EIGEN_TEST_NO_COMPLEX 12 #define EIGEN_TEST_FUNC cxx11_tensor_device 13 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int 14 #define EIGEN_USE_GPU 15 16 #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 17 #include <cuda_fp16.h> 18 #endif 19 #include "main.h" 20 #include <unsupported/Eigen/CXX11/Tensor> 21 22 using Eigen::Tensor; 23 using Eigen::RowMajor; 24 25 // Context for evaluation on cpu 26 struct CPUContext { 27 CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) { 28 kernel_1d_(0) = 3.14f; 29 kernel_1d_(1) = 2.7f; 30 31 kernel_2d_(0,0) = 3.14f; 32 kernel_2d_(1,0) = 2.7f; 33 kernel_2d_(0,1) = 0.2f; 34 kernel_2d_(1,1) = 7.0f; 35 36 kernel_3d_(0,0,0) = 3.14f; 37 kernel_3d_(0,1,0) = 2.7f; 38 kernel_3d_(0,0,1) = 0.2f; 39 kernel_3d_(0,1,1) = 7.0f; 40 kernel_3d_(1,0,0) = -1.0f; 41 kernel_3d_(1,1,0) = -0.3f; 42 kernel_3d_(1,0,1) = -0.7f; 43 kernel_3d_(1,1,1) = -0.5f; 44 } 45 46 const Eigen::DefaultDevice& device() const { return cpu_device_; } 47 48 const Eigen::Tensor<float, 3>& in1() const { return in1_; } 49 const Eigen::Tensor<float, 3>& in2() const { return in2_; } 50 Eigen::Tensor<float, 3>& out() { return out_; } 51 const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; } 52 const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; } 53 const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; } 54 55 private: 56 const Eigen::Tensor<float, 3>& in1_; 57 const Eigen::Tensor<float, 3>& in2_; 58 Eigen::Tensor<float, 3>& out_; 59 60 Eigen::Tensor<float, 1> kernel_1d_; 61 Eigen::Tensor<float, 2> kernel_2d_; 62 Eigen::Tensor<float, 3> kernel_3d_; 63 64 Eigen::DefaultDevice cpu_device_; 65 }; 66 67 68 // Context for evaluation on GPU 69 struct GPUContext { 70 GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { 71 assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess); 72 float kernel_1d_val[] = {3.14f, 2.7f}; 73 assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); 74 75 assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess); 76 float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; 77 assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); 78 79 assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess); 80 float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; 81 assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); 82 } 83 ~GPUContext() { 84 assert(cudaFree(kernel_1d_) == cudaSuccess); 85 assert(cudaFree(kernel_2d_) == cudaSuccess); 86 assert(cudaFree(kernel_3d_) == cudaSuccess); 87 } 88 89 const Eigen::GpuDevice& device() const { return gpu_device_; } 90 91 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; } 92 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; } 93 Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; } 94 Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); } 95 Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); } 96 Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); } 97 98 private: 99 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_; 100 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_; 101 Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_; 102 103 float* kernel_1d_; 104 float* kernel_2d_; 105 float* kernel_3d_; 106 107 Eigen::CudaStreamDevice stream_; 108 Eigen::GpuDevice gpu_device_; 109 }; 110 111 112 // The actual expression to evaluate 113 template <typename Context> 114 void test_contextual_eval(Context* context) 115 { 116 context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); 117 } 118 119 template <typename Context> 120 void test_forced_contextual_eval(Context* context) 121 { 122 context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); 123 } 124 125 template <typename Context> 126 void test_compound_assignment(Context* context) 127 { 128 context->out().device(context->device()) = context->in1().constant(2.718f); 129 context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; 130 } 131 132 133 template <typename Context> 134 void test_contraction(Context* context) 135 { 136 Eigen::array<std::pair<int, int>, 2> dims; 137 dims[0] = std::make_pair(1, 1); 138 dims[1] = std::make_pair(2, 2); 139 140 Eigen::array<int, 2> shape(40, 50*70); 141 142 Eigen::DSizes<int, 2> indices(0,0); 143 Eigen::DSizes<int, 2> sizes(40,40); 144 145 context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims); 146 } 147 148 149 template <typename Context> 150 void test_1d_convolution(Context* context) 151 { 152 Eigen::DSizes<int, 3> indices(0,0,0); 153 Eigen::DSizes<int, 3> sizes(40,49,70); 154 155 Eigen::array<int, 1> dims(1); 156 context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims); 157 } 158 159 template <typename Context> 160 void test_2d_convolution(Context* context) 161 { 162 Eigen::DSizes<int, 3> indices(0,0,0); 163 Eigen::DSizes<int, 3> sizes(40,49,69); 164 165 Eigen::array<int, 2> dims(1,2); 166 context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims); 167 } 168 169 template <typename Context> 170 void test_3d_convolution(Context* context) 171 { 172 Eigen::DSizes<int, 3> indices(0,0,0); 173 Eigen::DSizes<int, 3> sizes(39,49,69); 174 175 Eigen::array<int, 3> dims(0,1,2); 176 context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims); 177 } 178 179 180 void test_cpu() { 181 Eigen::Tensor<float, 3> in1(40,50,70); 182 Eigen::Tensor<float, 3> in2(40,50,70); 183 Eigen::Tensor<float, 3> out(40,50,70); 184 185 in1 = in1.random() + in1.constant(10.0f); 186 in2 = in2.random() + in2.constant(10.0f); 187 188 CPUContext context(in1, in2, out); 189 test_contextual_eval(&context); 190 for (int i = 0; i < 40; ++i) { 191 for (int j = 0; j < 50; ++j) { 192 for (int k = 0; k < 70; ++k) { 193 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); 194 } 195 } 196 } 197 198 test_forced_contextual_eval(&context); 199 for (int i = 0; i < 40; ++i) { 200 for (int j = 0; j < 50; ++j) { 201 for (int k = 0; k < 70; ++k) { 202 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); 203 } 204 } 205 } 206 207 test_compound_assignment(&context); 208 for (int i = 0; i < 40; ++i) { 209 for (int j = 0; j < 50; ++j) { 210 for (int k = 0; k < 70; ++k) { 211 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); 212 } 213 } 214 } 215 216 test_contraction(&context); 217 for (int i = 0; i < 40; ++i) { 218 for (int j = 0; j < 40; ++j) { 219 const float result = out(i,j,0); 220 float expected = 0; 221 for (int k = 0; k < 50; ++k) { 222 for (int l = 0; l < 70; ++l) { 223 expected += in1(i, k, l) * in2(j, k, l); 224 } 225 } 226 VERIFY_IS_APPROX(expected, result); 227 } 228 } 229 230 test_1d_convolution(&context); 231 for (int i = 0; i < 40; ++i) { 232 for (int j = 0; j < 49; ++j) { 233 for (int k = 0; k < 70; ++k) { 234 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); 235 } 236 } 237 } 238 239 test_2d_convolution(&context); 240 for (int i = 0; i < 40; ++i) { 241 for (int j = 0; j < 49; ++j) { 242 for (int k = 0; k < 69; ++k) { 243 const float result = out(i,j,k); 244 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) + 245 (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); 246 if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { 247 continue; 248 } 249 VERIFY_IS_APPROX(expected, result); 250 } 251 } 252 } 253 254 test_3d_convolution(&context); 255 for (int i = 0; i < 39; ++i) { 256 for (int j = 0; j < 49; ++j) { 257 for (int k = 0; k < 69; ++k) { 258 const float result = out(i,j,k); 259 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + 260 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) + 261 (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + 262 in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); 263 if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { 264 continue; 265 } 266 VERIFY_IS_APPROX(expected, result); 267 } 268 } 269 } 270 } 271 272 void test_gpu() { 273 Eigen::Tensor<float, 3> in1(40,50,70); 274 Eigen::Tensor<float, 3> in2(40,50,70); 275 Eigen::Tensor<float, 3> out(40,50,70); 276 in1 = in1.random() + in1.constant(10.0f); 277 in2 = in2.random() + in2.constant(10.0f); 278 279 std::size_t in1_bytes = in1.size() * sizeof(float); 280 std::size_t in2_bytes = in2.size() * sizeof(float); 281 std::size_t out_bytes = out.size() * sizeof(float); 282 283 float* d_in1; 284 float* d_in2; 285 float* d_out; 286 cudaMalloc((void**)(&d_in1), in1_bytes); 287 cudaMalloc((void**)(&d_in2), in2_bytes); 288 cudaMalloc((void**)(&d_out), out_bytes); 289 290 cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); 291 cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); 292 293 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70); 294 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70); 295 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70); 296 297 GPUContext context(gpu_in1, gpu_in2, gpu_out); 298 test_contextual_eval(&context); 299 assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); 300 for (int i = 0; i < 40; ++i) { 301 for (int j = 0; j < 50; ++j) { 302 for (int k = 0; k < 70; ++k) { 303 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); 304 } 305 } 306 } 307 308 test_forced_contextual_eval(&context); 309 assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); 310 for (int i = 0; i < 40; ++i) { 311 for (int j = 0; j < 50; ++j) { 312 for (int k = 0; k < 70; ++k) { 313 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); 314 } 315 } 316 } 317 318 test_compound_assignment(&context); 319 assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); 320 for (int i = 0; i < 40; ++i) { 321 for (int j = 0; j < 50; ++j) { 322 for (int k = 0; k < 70; ++k) { 323 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); 324 } 325 } 326 } 327 328 test_contraction(&context); 329 assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); 330 for (int i = 0; i < 40; ++i) { 331 for (int j = 0; j < 40; ++j) { 332 const float result = out(i,j,0); 333 float expected = 0; 334 for (int k = 0; k < 50; ++k) { 335 for (int l = 0; l < 70; ++l) { 336 expected += in1(i, k, l) * in2(j, k, l); 337 } 338 } 339 VERIFY_IS_APPROX(expected, result); 340 } 341 } 342 343 test_1d_convolution(&context); 344 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); 345 assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); 346 for (int i = 0; i < 40; ++i) { 347 for (int j = 0; j < 49; ++j) { 348 for (int k = 0; k < 70; ++k) { 349 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); 350 } 351 } 352 } 353 354 test_2d_convolution(&context); 355 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); 356 assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); 357 for (int i = 0; i < 40; ++i) { 358 for (int j = 0; j < 49; ++j) { 359 for (int k = 0; k < 69; ++k) { 360 const float result = out(i,j,k); 361 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + 362 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); 363 VERIFY_IS_APPROX(expected, result); 364 } 365 } 366 } 367 368 test_3d_convolution(&context); 369 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); 370 assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); 371 for (int i = 0; i < 39; ++i) { 372 for (int j = 0; j < 49; ++j) { 373 for (int k = 0; k < 69; ++k) { 374 const float result = out(i,j,k); 375 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + 376 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f + 377 in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + 378 in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); 379 VERIFY_IS_APPROX(expected, result); 380 } 381 } 382 } 383 } 384 385 386 void test_cxx11_tensor_device() 387 { 388 CALL_SUBTEST_1(test_cpu()); 389 CALL_SUBTEST_2(test_gpu()); 390 } 391