1 2 #ifndef EIGEN_TEST_CUDA_COMMON_H 3 #define EIGEN_TEST_CUDA_COMMON_H 4 5 #include <cuda.h> 6 #include <cuda_runtime.h> 7 #include <cuda_runtime_api.h> 8 #include <iostream> 9 10 #ifndef __CUDACC__ 11 dim3 threadIdx, blockDim, blockIdx; 12 #endif 13 14 template<typename Kernel, typename Input, typename Output> 15 void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out) 16 { 17 for(int i=0; i<n; i++) 18 ker(i, in.data(), out.data()); 19 } 20 21 22 template<typename Kernel, typename Input, typename Output> 23 __global__ 24 void run_on_cuda_meta_kernel(const Kernel ker, int n, const Input* in, Output* out) 25 { 26 int i = threadIdx.x + blockIdx.x*blockDim.x; 27 if(i<n) { 28 ker(i, in, out); 29 } 30 } 31 32 33 template<typename Kernel, typename Input, typename Output> 34 void run_on_cuda(const Kernel& ker, int n, const Input& in, Output& out) 35 { 36 typename Input::Scalar* d_in; 37 typename Output::Scalar* d_out; 38 std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar); 39 std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar); 40 41 cudaMalloc((void**)(&d_in), in_bytes); 42 cudaMalloc((void**)(&d_out), out_bytes); 43 44 cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); 45 cudaMemcpy(d_out, out.data(), out_bytes, cudaMemcpyHostToDevice); 46 47 // Simple and non-optimal 1D mapping assuming n is not too large 48 // That's only for unit testing! 49 dim3 Blocks(128); 50 dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) ); 51 52 cudaThreadSynchronize(); 53 run_on_cuda_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out); 54 cudaThreadSynchronize(); 55 56 // check inputs have not been modified 57 cudaMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, cudaMemcpyDeviceToHost); 58 cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost); 59 60 cudaFree(d_in); 61 cudaFree(d_out); 62 } 63 64 65 template<typename Kernel, typename Input, typename Output> 66 void run_and_compare_to_cuda(const Kernel& ker, int n, const Input& in, Output& out) 67 { 68 Input in_ref, in_cuda; 69 Output out_ref, out_cuda; 70 #ifndef __CUDA_ARCH__ 71 in_ref = in_cuda = in; 72 out_ref = out_cuda = out; 73 #endif 74 run_on_cpu (ker, n, in_ref, out_ref); 75 run_on_cuda(ker, n, in_cuda, out_cuda); 76 #ifndef __CUDA_ARCH__ 77 VERIFY_IS_APPROX(in_ref, in_cuda); 78 VERIFY_IS_APPROX(out_ref, out_cuda); 79 #endif 80 } 81 82 83 void ei_test_init_cuda() 84 { 85 int device = 0; 86 cudaDeviceProp deviceProp; 87 cudaGetDeviceProperties(&deviceProp, device); 88 std::cout << "CUDA device info:\n"; 89 std::cout << " name: " << deviceProp.name << "\n"; 90 std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n"; 91 std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n"; 92 std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n"; 93 std::cout << " warpSize: " << deviceProp.warpSize << "\n"; 94 std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << "\n"; 95 std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << "\n"; 96 std::cout << " clockRate: " << deviceProp.clockRate << "\n"; 97 std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << "\n"; 98 std::cout << " computeMode: " << deviceProp.computeMode << "\n"; 99 } 100 101 #endif // EIGEN_TEST_CUDA_COMMON_H 102