/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA
#define EIGEN_USE_GPU

#include <numeric>
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/cuda_launch_config.h"

// Synchronizes with the device, then EXPECTs that no CUDA error is pending.
// Kept as a brace-block (not do/while(0)) because the macros below use it
// without a trailing semicolon inside the TEST_LAUNCH_PARAMETER macros.
#define CUDA_EXPECT_SUCCESS                                 \
  {                                                         \
    cudaDeviceSynchronize();                                \
    cudaError_t err = cudaGetLastError();                   \
    EXPECT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \
  }

// Same as CUDA_EXPECT_SUCCESS but aborts the test on failure (ASSERT).
#define CUDA_ASSERT_SUCCESS                                 \
  {                                                         \
    cudaDeviceSynchronize();                                \
    cudaError_t err = cudaGetLastError();                   \
    ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \
  }

namespace tensorflow {

namespace {

// Zeroes outbuf[0, virtual_thread_count) using a 1-D grid-stride loop.
__global__ void SetOutbufZero(CudaLaunchConfig config, int* outbuf) {
  CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) { outbuf[x] = 0; }
}

// Counts the number of scheduled jobs by atomically adding 1 per virtual
// thread into outbuf (folded modulo bufsize so the buffer stays small).
__global__ void Count1D(CudaLaunchConfig config, int bufsize, int* outbuf) {
  CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
    if (x < 0) {  // x might overflow when testing extreme case
      break;
    }
    atomicAdd(&outbuf[x % bufsize], 1);
  }
}

// 2-D variant of Count1D: one atomic +1 per (x, y) virtual thread.
__global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
    if (x < 0) {  // x might overflow when testing extreme case
      break;
    }
    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
      if (y < 0) {  // y might overflow when testing extreme case
        break;
      }
      int idx = x * config.virtual_thread_count.y + y;
      atomicAdd(&outbuf[idx % bufsize], 1);
    }
  }
}

// 3-D variant of Count1D: one atomic +1 per (x, y, z) virtual thread.
__global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
    if (x < 0) {  // x might overflow when testing extreme case
      break;
    }
    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
      if (y < 0) {  // y might overflow when testing extreme case
        break;
      }
      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {
        if (z < 0) {  // z might overflow when testing extreme case
          break;
        }
        int idx =
            x * config.virtual_thread_count.y * config.virtual_thread_count.z +
            y * config.virtual_thread_count.z + z;
        atomicAdd(&outbuf[idx % bufsize], 1);
      }
    }
  }
}

// Verifies that the detail::CudaShuffle*GetSrcLane helpers predict the same
// source lane that the corresponding hardware shuffle actually reads from.
// Expected to be launched with a single warp (1 block, 32 threads).
// Increments *failure_count for every mismatch found.
__global__ void CudaShuffleGetSrcLaneTest(unsigned* failure_count) {
  unsigned lane_id = CudaLaneId();
  for (int width = warpSize; width > 1; width /= 2) {
    auto check_result = [&](const char* op_name, int param, unsigned actual,
                            unsigned expected) {
      if (actual != expected) {
        // Note: %u (not %d) for the unsigned lane_id/actual/expected args;
        // mismatched printf conversions are undefined behavior.
        printf("Cuda%sGetSrcLane(%d, %d) for lane %u returned %u, not %u\n",
               op_name, param, width, lane_id, actual, expected);
        CudaAtomicAdd(failure_count, 1);
      }
    };
    for (int src_lane = -warpSize; src_lane <= warpSize; ++src_lane) {
      unsigned actual_lane = detail::CudaShuffleGetSrcLane(src_lane, width);
      unsigned expect_lane =
          CudaShuffleSync(kCudaWarpAll, lane_id, src_lane, width);
      check_result("Shuffle", src_lane, actual_lane, expect_lane);
    }
    for (unsigned delta = 0; delta <= warpSize; ++delta) {
      unsigned actual_lane = detail::CudaShuffleUpGetSrcLane(delta, width);
      unsigned expect_lane =
          CudaShuffleUpSync(kCudaWarpAll, lane_id, delta, width);
      check_result("ShuffleUp", delta, actual_lane, expect_lane);
    }
    for (unsigned delta = 0; delta <= warpSize; ++delta) {
      unsigned actual_lane = detail::CudaShuffleDownGetSrcLane(delta, width);
      unsigned expect_lane =
          CudaShuffleDownSync(kCudaWarpAll, lane_id, delta, width);
      check_result("ShuffleDown", delta, actual_lane, expect_lane);
    }
    for (int lane_lane = warpSize; lane_lane > 0; lane_lane /= 2) {
      unsigned actual_lane = detail::CudaShuffleXorGetSrcLane(lane_lane, width);
      unsigned expect_lane =
          CudaShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width);
      check_result("ShuffleXor", lane_lane, actual_lane, expect_lane);
    }
  }
}

}  // namespace

// Test fixture owning a managed-memory count buffer and an Eigen GPU device.
// Managed (unified) memory lets the host-side std::accumulate checks below
// read the device-written counts directly after a cudaDeviceSynchronize.
class CudaLaunchConfigTest : public ::testing::Test {
 protected:
  const int bufsize = 1024;
  int* outbuf = nullptr;  // managed allocation of bufsize ints
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice d = Eigen::GpuDevice(&stream);

  virtual void SetUp() {
    cudaError_t err = cudaMallocManaged(&outbuf, sizeof(int) * bufsize);
    ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err);
  }

  virtual void TearDown() {
    // Make sure all in-flight kernels that touch outbuf finished before free.
    cudaDeviceSynchronize();
    cudaFree(outbuf);
    outbuf = nullptr;
  }
};

TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
  CudaLaunchConfig cfg;

  // For each work_element_count: zero the buffer, launch Count1D with the
  // computed config, then verify exactly one increment happened per element.
  // Both the occupancy-unaware and the kernel-aware (..., kernel, 0, 0)
  // overloads of GetCudaLaunchConfig are exercised.
  // test valid inputs
#define TEST_LAUNCH_PARAMETER(work_element_count)                              \
  cfg = GetCudaLaunchConfig(bufsize, d);                                       \
  TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count,                 \
                               cfg.thread_per_block, 0, d.stream(), cfg,       \
                               outbuf));                                       \
  CUDA_ASSERT_SUCCESS                                                          \
  cfg = GetCudaLaunchConfig(work_element_count, d);                            \
  TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
                               0, d.stream(), cfg, bufsize, outbuf));          \
  CUDA_EXPECT_SUCCESS                                                          \
  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
                                                                               \
  cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                  \
  TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count,                 \
                               cfg.thread_per_block, 0, d.stream(), cfg,       \
                               outbuf));                                       \
  CUDA_ASSERT_SUCCESS                                                          \
  cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0);             \
  TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
                               0, d.stream(), cfg, bufsize, outbuf));          \
  CUDA_EXPECT_SUCCESS                                                          \
  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))

  TEST_LAUNCH_PARAMETER(128);
  TEST_LAUNCH_PARAMETER(129);
  TEST_LAUNCH_PARAMETER(511);
  TEST_LAUNCH_PARAMETER(512);
  TEST_LAUNCH_PARAMETER(2048);
  TEST_LAUNCH_PARAMETER(2049);
  TEST_LAUNCH_PARAMETER(8191);
  TEST_LAUNCH_PARAMETER(8192);
  TEST_LAUNCH_PARAMETER(123456);
  TEST_LAUNCH_PARAMETER(1 << 30);
#undef TEST_LAUNCH_PARAMETER
}

// Member-wise equality for Cuda2DLaunchConfig.
// Fix: the original compared thread_per_block a second time instead of
// virtual_thread_count, so two configs differing only in virtual thread
// count compared equal. All three members are now compared.
bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
  return a.thread_per_block.x == b.thread_per_block.x &&
         a.thread_per_block.y == b.thread_per_block.y &&
         a.thread_per_block.z == b.thread_per_block.z &&
         a.block_count.x == b.block_count.x &&
         a.block_count.y == b.block_count.y &&
         a.block_count.z == b.block_count.z &&
         a.virtual_thread_count.x == b.virtual_thread_count.x &&
         a.virtual_thread_count.y == b.virtual_thread_count.y &&
         a.virtual_thread_count.z == b.virtual_thread_count.z;
}

TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
  Cuda2DLaunchConfig cfg;
  CudaLaunchConfig cfg1d;

  // Same pattern as the 1-D test: zero, count with Count2D, check that the
  // total equals dimx * dimy for both GetCuda2DLaunchConfig overloads.
  // test valid inputs
#define TEST_LAUNCH_PARAMETER(dimx, dimy)                                \
  cfg1d = GetCudaLaunchConfig(bufsize, d);                               \
  TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,        \
                                cfg1d.thread_per_block, 0, d.stream(),   \
                                cfg1d, outbuf));                         \
  CUDA_ASSERT_SUCCESS                                                    \
  cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                            \
  TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,                \
                                cfg.thread_per_block, 0, d.stream(),     \
                                cfg, bufsize, outbuf));                  \
  CUDA_EXPECT_SUCCESS                                                    \
  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0));   \
                                                                         \
  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);          \
  TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,        \
                                cfg1d.thread_per_block, 0, d.stream(),   \
                                cfg1d, outbuf));                         \
  CUDA_ASSERT_SUCCESS                                                    \
  cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);             \
  TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,                \
                                cfg.thread_per_block, 0, d.stream(),     \
                                cfg, bufsize, outbuf));                  \
  CUDA_EXPECT_SUCCESS                                                    \
  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))

  TEST_LAUNCH_PARAMETER(128, 128);
  TEST_LAUNCH_PARAMETER(129, 64);
  TEST_LAUNCH_PARAMETER(511, 2048);
  TEST_LAUNCH_PARAMETER(512, 512);
  TEST_LAUNCH_PARAMETER(2048, 1024);
  TEST_LAUNCH_PARAMETER(2049, 32);
  TEST_LAUNCH_PARAMETER(8191, 1);
  TEST_LAUNCH_PARAMETER(8192, 10);
  TEST_LAUNCH_PARAMETER(123456, 12);
  TEST_LAUNCH_PARAMETER(1, 1 << 30);
  TEST_LAUNCH_PARAMETER(1 << 30, 1);
#undef TEST_LAUNCH_PARAMETER
}

TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
  Cuda3DLaunchConfig cfg;
  CudaLaunchConfig cfg1d;

  // 3-D variant: total increments must equal dimx * dimy * dimz. Only the
  // kernel-aware overload exists for 3-D configs.
  // test valid inputs
#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                              \
  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);              \
  TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,            \
                                cfg1d.thread_per_block, 0, d.stream(),       \
                                cfg1d, outbuf));                             \
  CUDA_ASSERT_SUCCESS                                                        \
  cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);           \
  TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count,                    \
                                cfg.thread_per_block, 0, d.stream(),         \
                                cfg, bufsize, outbuf));                      \
  CUDA_EXPECT_SUCCESS                                                        \
  EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))

  TEST_LAUNCH_PARAMETER(128, 128, 128);
  TEST_LAUNCH_PARAMETER(129, 64, 1024);
  TEST_LAUNCH_PARAMETER(511, 2048, 128);
  TEST_LAUNCH_PARAMETER(512, 512, 64);
  TEST_LAUNCH_PARAMETER(2048, 1024, 128);
  TEST_LAUNCH_PARAMETER(2049, 32, 1024);
  TEST_LAUNCH_PARAMETER(8191, 1, 1024);
  TEST_LAUNCH_PARAMETER(8192, 10, 32);
  TEST_LAUNCH_PARAMETER(123456, 12, 21);
  TEST_LAUNCH_PARAMETER(1, 1, 1 << 30);
  TEST_LAUNCH_PARAMETER(1, 1 << 30, 1);
  TEST_LAUNCH_PARAMETER(1 << 30, 1, 1);
#undef TEST_LAUNCH_PARAMETER
}

TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
  // Launch exactly one warp (1 block x 32 threads) so the shuffle-width
  // sweep in the kernel covers every lane; the failure counter lives in
  // managed memory so the host can read it after the sync below.
  unsigned* failure_count;
  ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
  *failure_count = 0;
  TF_EXPECT_OK(CudaLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
                                failure_count));
  ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
  ASSERT_EQ(*failure_count, 0);
  cudaFree(failure_count);
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA