Home | History | Annotate | Download | only in util
      1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #if GOOGLE_CUDA
     17 #define EIGEN_USE_GPU
     18 
     19 #include <numeric>
     20 #include "tensorflow/core/lib/core/status_test_util.h"
     21 #include "tensorflow/core/platform/test.h"
     22 #include "tensorflow/core/util/cuda_kernel_helper.h"
     23 #include "tensorflow/core/util/cuda_launch_config.h"
     24 
     25 #define CUDA_EXPECT_SUCCESS                                 \
     26   {                                                         \
     27     cudaDeviceSynchronize();                                \
     28     cudaError_t err = cudaGetLastError();                   \
     29     EXPECT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \
     30   }
     31 
     32 #define CUDA_ASSERT_SUCCESS                                 \
     33   {                                                         \
     34     cudaDeviceSynchronize();                                \
     35     cudaError_t err = cudaGetLastError();                   \
     36     ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \
     37   }
     38 
     39 namespace tensorflow {
     40 
     41 namespace {
     42 
     43 __global__ void SetOutbufZero(CudaLaunchConfig config, int* outbuf) {
     44   CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) { outbuf[x] = 0; }
     45 }
     46 
     47 // counting number of jobs by using atomic +1
     48 __global__ void Count1D(CudaLaunchConfig config, int bufsize, int* outbuf) {
     49   CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
     50     if (x < 0) {  // x might overflow when testing extreme case
     51       break;
     52     }
     53     atomicAdd(&outbuf[x % bufsize], 1);
     54   }
     55 }
     56 __global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
     57   CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
     58     if (x < 0) {  // x might overflow when testing extreme case
     59       break;
     60     }
     61     CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
     62       if (y < 0) {  // y might overflow when testing extreme case
     63         break;
     64       }
     65       int idx = x * config.virtual_thread_count.y + y;
     66       atomicAdd(&outbuf[idx % bufsize], 1);
     67     }
     68   }
     69 }
     70 __global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
     71   CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
     72     if (x < 0) {  // x might overflow when testing extreme case
     73       break;
     74     }
     75     CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
     76       if (y < 0) {  // y might overflow when testing extreme case
     77         break;
     78       }
     79       CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {
     80         if (z < 0) {  // z might overflow when testing extreme case
     81           break;
     82         }
     83         int idx =
     84             x * config.virtual_thread_count.y * config.virtual_thread_count.z +
     85             y * config.virtual_thread_count.z + z;
     86         atomicAdd(&outbuf[idx % bufsize], 1);
     87       }
     88     }
     89   }
     90 }
     91 
     92 __global__ void CudaShuffleGetSrcLaneTest(unsigned* failure_count) {
     93   unsigned lane_id = CudaLaneId();
     94   for (int width = warpSize; width > 1; width /= 2) {
     95     auto check_result = [&](const char* op_name, int param, unsigned actual,
     96                             unsigned expected) {
     97       if (actual != expected) {
     98         printf("Cuda%sGetSrcLane(%d, %d) for lane %d returned %d, not %d\n",
     99                op_name, param, width, lane_id, actual, expected);
    100         CudaAtomicAdd(failure_count, 1);
    101       }
    102     };
    103     for (int src_lane = -warpSize; src_lane <= warpSize; ++src_lane) {
    104       unsigned actual_lane = detail::CudaShuffleGetSrcLane(src_lane, width);
    105       unsigned expect_lane =
    106           CudaShuffleSync(kCudaWarpAll, lane_id, src_lane, width);
    107       check_result("Shuffle", src_lane, actual_lane, expect_lane);
    108     }
    109     for (unsigned delta = 0; delta <= warpSize; ++delta) {
    110       unsigned actual_lane = detail::CudaShuffleUpGetSrcLane(delta, width);
    111       unsigned expect_lane =
    112           CudaShuffleUpSync(kCudaWarpAll, lane_id, delta, width);
    113       check_result("ShuffleUp", delta, actual_lane, expect_lane);
    114     }
    115     for (unsigned delta = 0; delta <= warpSize; ++delta) {
    116       unsigned actual_lane = detail::CudaShuffleDownGetSrcLane(delta, width);
    117       unsigned expect_lane =
    118           CudaShuffleDownSync(kCudaWarpAll, lane_id, delta, width);
    119       check_result("ShuffleDown", delta, actual_lane, expect_lane);
    120     }
    121     for (int lane_lane = warpSize; lane_lane > 0; lane_lane /= 2) {
    122       unsigned actual_lane = detail::CudaShuffleXorGetSrcLane(lane_lane, width);
    123       unsigned expect_lane =
    124           CudaShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width);
    125       check_result("ShuffleXor", lane_lane, actual_lane, expect_lane);
    126     }
    127   }
    128 }
    129 
    130 }  // namespace
    131 
    132 class CudaLaunchConfigTest : public ::testing::Test {
    133  protected:
    134   const int bufsize = 1024;
    135   int* outbuf = nullptr;
    136   Eigen::GpuStreamDevice stream;
    137   Eigen::GpuDevice d = Eigen::GpuDevice(&stream);
    138 
    139   virtual void SetUp() {
    140     cudaError_t err = cudaMallocManaged(&outbuf, sizeof(int) * bufsize);
    141     ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err);
    142   }
    143 
    144   virtual void TearDown() {
    145     cudaDeviceSynchronize();
    146     cudaFree(outbuf);
    147     outbuf = nullptr;
    148   }
    149 };
    150 
    151 TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
    152   CudaLaunchConfig cfg;
    153 
    154 // test valid inputs
    155 #define TEST_LAUNCH_PARAMETER(work_element_count)                              \
    156   cfg = GetCudaLaunchConfig(bufsize, d);                                       \
    157   TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count,                 \
    158                                cfg.thread_per_block, 0, d.stream(), cfg,       \
    159                                outbuf));                                       \
    160   CUDA_ASSERT_SUCCESS                                                          \
    161   cfg = GetCudaLaunchConfig(work_element_count, d);                            \
    162   TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
    163                                0, d.stream(), cfg, bufsize, outbuf));          \
    164   CUDA_EXPECT_SUCCESS                                                          \
    165   EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
    166                                                                                \
    167   cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                  \
    168   TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count,                 \
    169                                cfg.thread_per_block, 0, d.stream(), cfg,       \
    170                                outbuf));                                       \
    171   CUDA_ASSERT_SUCCESS                                                          \
    172   cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0);             \
    173   TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
    174                                0, d.stream(), cfg, bufsize, outbuf));          \
    175   CUDA_EXPECT_SUCCESS                                                          \
    176   EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
    177 
    178   TEST_LAUNCH_PARAMETER(128);
    179   TEST_LAUNCH_PARAMETER(129);
    180   TEST_LAUNCH_PARAMETER(511);
    181   TEST_LAUNCH_PARAMETER(512);
    182   TEST_LAUNCH_PARAMETER(2048);
    183   TEST_LAUNCH_PARAMETER(2049);
    184   TEST_LAUNCH_PARAMETER(8191);
    185   TEST_LAUNCH_PARAMETER(8192);
    186   TEST_LAUNCH_PARAMETER(123456);
    187   TEST_LAUNCH_PARAMETER(1 << 30);
    188 #undef TEST_LAUNCH_PARAMETER
    189 }
    190 
    191 bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
    192   return a.thread_per_block.x == b.thread_per_block.x &&
    193          a.thread_per_block.y == b.thread_per_block.y &&
    194          a.thread_per_block.z == b.thread_per_block.z &&
    195          a.block_count.x == b.block_count.x &&
    196          a.block_count.y == b.block_count.y &&
    197          a.block_count.z == b.block_count.z &&
    198          a.thread_per_block.x == b.thread_per_block.x &&
    199          a.thread_per_block.y == b.thread_per_block.y &&
    200          a.thread_per_block.z == b.thread_per_block.z;
    201 }
    202 
    203 TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
    204   Cuda2DLaunchConfig cfg;
    205   CudaLaunchConfig cfg1d;
    206 
    207 // test valid inputs
    208 #define TEST_LAUNCH_PARAMETER(dimx, dimy)                                     \
    209   cfg1d = GetCudaLaunchConfig(bufsize, d);                                    \
    210   TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,             \
    211                                 cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
    212                                 outbuf));                                     \
    213   CUDA_ASSERT_SUCCESS                                                         \
    214   cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                 \
    215   TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,                     \
    216                                 cfg.thread_per_block, 0, d.stream(), cfg,     \
    217                                 bufsize, outbuf));                            \
    218   CUDA_EXPECT_SUCCESS                                                         \
    219   EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0));        \
    220                                                                               \
    221   cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
    222   TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,             \
    223                                 cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
    224                                 outbuf));                                     \
    225   CUDA_ASSERT_SUCCESS                                                         \
    226   cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                  \
    227   TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,                     \
    228                                 cfg.thread_per_block, 0, d.stream(), cfg,     \
    229                                 bufsize, outbuf));                            \
    230   CUDA_EXPECT_SUCCESS                                                         \
    231   EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
    232 
    233   TEST_LAUNCH_PARAMETER(128, 128);
    234   TEST_LAUNCH_PARAMETER(129, 64);
    235   TEST_LAUNCH_PARAMETER(511, 2048);
    236   TEST_LAUNCH_PARAMETER(512, 512);
    237   TEST_LAUNCH_PARAMETER(2048, 1024);
    238   TEST_LAUNCH_PARAMETER(2049, 32);
    239   TEST_LAUNCH_PARAMETER(8191, 1);
    240   TEST_LAUNCH_PARAMETER(8192, 10);
    241   TEST_LAUNCH_PARAMETER(123456, 12);
    242   TEST_LAUNCH_PARAMETER(1, 1 << 30);
    243   TEST_LAUNCH_PARAMETER(1 << 30, 1);
    244 #undef TEST_LAUNCH_PARAMETER
    245 }
    246 
    247 TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
    248   Cuda3DLaunchConfig cfg;
    249   CudaLaunchConfig cfg1d;
    250 
    251 // test valid inputs
    252 #define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                               \
    253   cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
    254   TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,             \
    255                                 cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
    256                                 outbuf));                                     \
    257   CUDA_ASSERT_SUCCESS                                                         \
    258   cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);            \
    259   TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count,                     \
    260                                 cfg.thread_per_block, 0, d.stream(), cfg,     \
    261                                 bufsize, outbuf));                            \
    262   CUDA_EXPECT_SUCCESS                                                         \
    263   EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
    264 
    265   TEST_LAUNCH_PARAMETER(128, 128, 128);
    266   TEST_LAUNCH_PARAMETER(129, 64, 1024);
    267   TEST_LAUNCH_PARAMETER(511, 2048, 128);
    268   TEST_LAUNCH_PARAMETER(512, 512, 64);
    269   TEST_LAUNCH_PARAMETER(2048, 1024, 128);
    270   TEST_LAUNCH_PARAMETER(2049, 32, 1024);
    271   TEST_LAUNCH_PARAMETER(8191, 1, 1024);
    272   TEST_LAUNCH_PARAMETER(8192, 10, 32);
    273   TEST_LAUNCH_PARAMETER(123456, 12, 21);
    274   TEST_LAUNCH_PARAMETER(1, 1, 1 << 30);
    275   TEST_LAUNCH_PARAMETER(1, 1 << 30, 1);
    276   TEST_LAUNCH_PARAMETER(1 << 30, 1, 1);
    277 #undef TEST_LAUNCH_PARAMETER
    278 }
    279 
    280 TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
    281   unsigned* failure_count;
    282   ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
    283   *failure_count = 0;
    284   TF_EXPECT_OK(CudaLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
    285                                 failure_count));
    286   ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
    287   ASSERT_EQ(*failure_count, 0);
    288   cudaFree(failure_count);
    289 }
    290 
    291 }  // namespace tensorflow
    292 
    293 #endif  // GOOGLE_CUDA
    294