1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog (a] gmail.com> 5 // Copyright (C) 2014 Navdeep Jaitly <ndjaitly (a] google.com> 6 // 7 // This Source Code Form is subject to the terms of the Mozilla 8 // Public License v. 2.0. If a copy of the MPL was not distributed 9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 10 11 #define EIGEN_TEST_NO_LONGDOUBLE 12 #define EIGEN_TEST_NO_COMPLEX 13 #define EIGEN_TEST_FUNC cxx11_tensor_cuda 14 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int 15 #define EIGEN_USE_GPU 16 17 #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 18 #include <cuda_fp16.h> 19 #endif 20 #include "main.h" 21 #include <unsupported/Eigen/CXX11/Tensor> 22 23 using Eigen::Tensor; 24 typedef Tensor<float, 1>::DimensionPair DimPair; 25 26 template<int DataLayout> 27 void test_cuda_contraction(int m_size, int k_size, int n_size) 28 { 29 std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; 30 // with these dimensions, the output has 300 * 140 elements, which is 31 // more than 30 * 1024, which is the number of threads in blocks on 32 // a 15 SM GK110 GPU 33 Tensor<float, 2, DataLayout> t_left(m_size, k_size); 34 Tensor<float, 2, DataLayout> t_right(k_size, n_size); 35 Tensor<float, 2, DataLayout> t_result(m_size, n_size); 36 Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size); 37 Eigen::array<DimPair, 1> dims(DimPair(1, 0)); 38 39 t_left.setRandom(); 40 t_right.setRandom(); 41 42 std::size_t t_left_bytes = t_left.size() * sizeof(float); 43 std::size_t t_right_bytes = t_right.size() * sizeof(float); 44 std::size_t t_result_bytes = t_result.size() * sizeof(float); 45 46 float* d_t_left; 47 float* d_t_right; 48 float* d_t_result; 49 50 cudaMalloc((void**)(&d_t_left), t_left_bytes); 51 cudaMalloc((void**)(&d_t_right), t_right_bytes); 52 cudaMalloc((void**)(&d_t_result), t_result_bytes); 53 54 cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); 55 cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); 56 57 Eigen::CudaStreamDevice stream; 58 Eigen::GpuDevice gpu_device(&stream); 59 60 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > 61 gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size)); 62 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > 63 gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size)); 64 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > 65 gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size)); 66 67 68 gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); 69 t_result = t_left.contract(t_right, dims); 70 71 cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); 72 for (DenseIndex i = 0; i < t_result.size(); i++) { 73 if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { 74 continue; 75 } 76 if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { 77 continue; 78 } 79 std::cout << "mismatch detected at index " << i << ": " << t_result(i) 80 << " vs " << t_result_gpu(i) << std::endl; 81 assert(false); 82 } 83 84 cudaFree((void*)d_t_left); 85 cudaFree((void*)d_t_right); 86 cudaFree((void*)d_t_result); 87 } 88 89 90 template<int DataLayout> 91 void test_scalar(int m_size, int k_size, int n_size) 92 { 93 std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; 94 // with these dimensions, the output has 300 * 140 elements, which is 95 // more than 30 * 1024, which is the number of threads in blocks on 96 // a 15 SM GK110 GPU 97 Tensor<float, 2, DataLayout> t_left(m_size, k_size); 98 Tensor<float, 2, DataLayout> t_right(k_size, n_size); 99 Tensor<float, 0, DataLayout> t_result; 100 Tensor<float, 0, DataLayout> t_result_gpu; 101 Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1)); 102 103 t_left.setRandom(); 104 t_right.setRandom(); 105 106 std::size_t t_left_bytes = t_left.size() * sizeof(float); 107 std::size_t t_right_bytes = t_right.size() * sizeof(float); 108 std::size_t t_result_bytes = sizeof(float); 109 110 float* d_t_left; 111 float* d_t_right; 112 float* d_t_result; 113 114 cudaMalloc((void**)(&d_t_left), t_left_bytes); 115 cudaMalloc((void**)(&d_t_right), t_right_bytes); 116 cudaMalloc((void**)(&d_t_result), t_result_bytes); 117 118 cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); 119 cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); 120 121 Eigen::CudaStreamDevice stream; 122 Eigen::GpuDevice gpu_device(&stream); 123 124 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > 125 gpu_t_left(d_t_left, m_size, k_size); 126 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > 127 gpu_t_right(d_t_right, k_size, n_size); 128 Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> > 129 gpu_t_result(d_t_result); 130 131 gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); 132 t_result = t_left.contract(t_right, dims); 133 134 cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); 135 if (fabs(t_result() - t_result_gpu()) > 1e-4f && 136 !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { 137 std::cout << "mismatch detected: " << t_result() 138 << " vs " << t_result_gpu() << std::endl; 139 assert(false); 140 } 141 142 cudaFree((void*)d_t_left); 143 cudaFree((void*)d_t_right); 144 cudaFree((void*)d_t_result); 145 } 146 147 148 template<int DataLayout> 149 void test_cuda_contraction_m() { 150 for (int k = 32; k < 256; k++) { 151 test_cuda_contraction<ColMajor>(k, 128, 128); 152 test_cuda_contraction<RowMajor>(k, 128, 128); 153 } 154 } 155 156 template<int DataLayout> 157 void test_cuda_contraction_k() { 158 for (int k = 32; k < 256; k++) { 159 test_cuda_contraction<ColMajor>(128, k, 128); 160 test_cuda_contraction<RowMajor>(128, k, 128); 161 } 162 } 163 164 template<int DataLayout> 165 void test_cuda_contraction_n() { 166 for (int k = 32; k < 256; k++) { 167 test_cuda_contraction<ColMajor>(128, 128, k); 168 test_cuda_contraction<RowMajor>(128, 128, k); 169 } 170 } 171 172 173 template<int DataLayout> 174 void test_cuda_contraction_sizes() { 175 int m_sizes[] = { 31, 39, 63, 64, 65, 176 127, 129, 255, 257 , 511, 177 512, 513, 1023, 1024, 1025}; 178 179 int n_sizes[] = { 31, 39, 63, 64, 65, 180 127, 129, 255, 257, 511, 181 512, 513, 1023, 1024, 1025}; 182 183 int k_sizes[] = { 31, 39, 63, 64, 65, 184 95, 96, 127, 129, 255, 185 257, 511, 512, 513, 1023, 186 1024, 1025}; 187 188 for (int i = 0; i < 15; i++) { 189 for (int j = 0; j < 15; j++) { 190 for (int k = 0; k < 17; k++) { 191 test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); 192 } 193 } 194 } 195 } 196 197 void test_cxx11_tensor_cuda() 198 { 199 CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128)); 200 CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128)); 201 202 CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128)); 203 CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128)); 204 205 CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>()); 206 CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>()); 207 208 CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>()); 209 CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>()); 210 211 CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>()); 212 CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>()); 213 214 CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>()); 215 CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>()); 216 } 217