/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "tensorflow/core/kernels/depthtospace_op.h"

#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {
namespace {

using GPUDevice = Eigen::GpuDevice;

// Depth2Space kernel for FORMAT_NHWC.
// See 'depthtospace_op.h' for a more detailed description.
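// For example (as in the DepthToSpace op documentation), with block_size = 2
// an NHWC input of shape [1, 1, 1, 4] holding [1, 2, 3, 4] becomes an output
// of shape [1, 2, 2, 1] holding [[1], [2]], [[3], [4]].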
template <typename dtype>
__global__ void D2S_NHWC(const int32 nthreads,
                         const dtype* __restrict__ input_ptr,
                         const int block_size, const int batch_size,
                         const int input_height, const int input_width,
                         const int input_depth, const int output_height,
                         const int output_width, const int output_depth,
                         dtype* __restrict__ output_ptr) {
  CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
    // out_idx = d + output_depth * (w + output_width * (h + output_height * b))
    const int d = out_idx % output_depth;
    const int out_idx2 = out_idx / output_depth;
    const int w = out_idx2 % output_width;
    const int out_idx3 = out_idx2 / output_width;
    const int h = out_idx3 % output_height;
    const int b = out_idx3 / output_height;

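    // Each output pixel (h, w) reads from input pixel (h / block_size,
    // w / block_size); the offset within the block_size x block_size block
    // selects which slice of the input depth supplies output channel d.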
    const int in_h = h / block_size;
    const int offset_h = h % block_size;
    const int in_w = w / block_size;
    const int offset_w = w % block_size;
    const int offset_d = (offset_h * block_size + offset_w) * output_depth;
    const int in_d = d + offset_d;
    const int inp_idx =
        in_d + input_depth * (in_w + input_width * (in_h + input_height * b));
    *(output_ptr + out_idx) = ldg(input_ptr + inp_idx);
  }
}

// Depth2Space kernel for FORMAT_NCHW.
// See 'depthtospace_op.h' for a more detailed description.
template <typename dtype>
__global__ void D2S_NCHW(const int32 nthreads,
                         const dtype* __restrict__ input_ptr,
                         const int block_size, const int input_width,
                         const int output_depth_by_input_height,
                         dtype* __restrict__ output_ptr) {
  CUDA_1D_KERNEL_LOOP(input_idx, nthreads) {
    // We will be converting the image from ordering:
    // n, bY, bX, oC, iY, iX    (== input_idx)   to
    // n, oC, iY, bY, iX, bX

    // Start reading the input data straight away since we know the address.
    // We calculate the output address in parallel while this is being fetched.

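    // Peel coordinates off input_idx from fastest- to slowest-varying:
    // iX first, then the fused (oC, iY) index, then bX, then bY, and
    // finally n.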
    const int n_bY_bX_oC_iY = input_idx / input_width;
    const int iX = input_idx - n_bY_bX_oC_iY * input_width;

    const int n_bY_bX = n_bY_bX_oC_iY / output_depth_by_input_height;
    const int oC_iY = n_bY_bX_oC_iY - n_bY_bX * output_depth_by_input_height;

    const int n_bY = n_bY_bX / block_size;
    const int bX = n_bY_bX - n_bY * block_size;

    const int n = n_bY / block_size;
    const int bY = n_bY - n * block_size;

    const int output_idx =
        bX +
        block_size *
            (iX + input_width *
                      (bY + block_size *
                                (oC_iY + n * output_depth_by_input_height)));

    *(output_ptr + output_idx) = ldg(input_ptr + input_idx);
  }
}

template <typename dtype, int block_size>
__global__ void D2S_NCHW_LOOP(const int32 nthreads,
                              const dtype* __restrict__ input,
                              const int input_width, const int output_width,
                              const int output_depth_by_input_area,
                              const int input_depth_by_input_area,
                              dtype* __restrict__ output) {
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    // We will be converting the image from ordering:
    // n, bY, bX, oC, iY, iX   to
    // n, oC, iY, bY, iX, bX

    // We assume thread_idx encodes n_oC_iY_iX and use an unrolled loop over
    // the bY and bX coordinates within the block. This kernel is
    // significantly more performant than the D2S_NCHW kernel.
    //   A likely explanation of the improvement is that although both kernels
    // achieve coalesced input reads, this one writes the output data more
    // densely per warp, so it should benefit when delayed cache write-back is
    // in effect.

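    // thread_idx is decomposed twice below: once as (n_oC_iY, iX) to build
    // the output address, and once as (n, oC_iY_iX) to build the input
    // address.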
    const int n_oC_iY = thread_idx / input_width;
    const int iX = thread_idx - n_oC_iY * input_width;

    const int n = thread_idx / output_depth_by_input_area;
    const int oC_iY_iX = thread_idx - n * output_depth_by_input_area;

    // Recombine the components and apply to the input and output pointers.
    auto input_ptr = input + n * input_depth_by_input_area + oC_iY_iX;
    auto output_ptr = output + (n_oC_iY * output_width + iX) * block_size;
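    // input_ptr addresses the (bY, bX) = (0, 0) element of the block for this
    // (n, oC, iY, iX); consecutive block elements lie
    // output_depth_by_input_area apart in the input. output_ptr addresses the
    // corresponding top-left output pixel.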

    // Copy a patch of data to the output batch image.
#pragma unroll
    for (int bY = 0; bY < block_size; ++bY) {
#pragma unroll
      for (int bX = 0; bX < block_size; ++bX) {
        output_ptr[bY * output_width + bX] = ldg(
            input_ptr + (bY * block_size + bX) * output_depth_by_input_area);
      }
    }
  }
}

}  // namespace

// Specialization of DepthToSpaceOpFunctor for a GPUDevice.
namespace functor {

template <typename T>
struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NHWC> {
  void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  int block_size, typename TTypes<T, 4>::Tensor output) {
    const int batch_size = output.dimension(0);
    const int input_height = input.dimension(1);
    const int input_width = input.dimension(2);
    const int input_depth = input.dimension(3);
    const int output_height = output.dimension(1);
    const int output_width = output.dimension(2);
    const int output_depth = output.dimension(3);

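    // Launch one GPU thread per output element.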
    const int total_count =
        batch_size * output_height * output_width * output_depth;
    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
    D2S_NHWC<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
        config.virtual_thread_count, input.data(), block_size, batch_size,
        input_height, input_width, input_depth, output_height, output_width,
        output_depth, output.data());
  }
  void operator()(const GPUDevice& d, typename TTypes<T, 5>::ConstTensor input,
                  int block_size, typename TTypes<T, 5>::Tensor output) {
    LOG(FATAL) << "5-D tensors should not be used with NHWC format";
  }
};

template <typename T>
struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NCHW> {
  void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  int block_size, typename TTypes<T, 4>::Tensor output) {
    const int batch_size = input.dimension(0);
    const int input_depth = input.dimension(1);
    const int input_height = input.dimension(2);
    const int input_width = input.dimension(3);
    const int output_depth = output.dimension(1);
    const int input_area = input_width * input_height;
    const int input_depth_by_input_area = input_depth * input_area;

    // We improve performance by generating instantiations of the loop kernel
    // for the most common block sizes.
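    // Making block_size a template parameter lets the #pragma unroll loops in
    // D2S_NCHW_LOOP be fully unrolled at compile time.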
    if (block_size <= 4) {
      const int output_width = output.dimension(3);
      const int output_depth_by_input_area = output_depth * input_area;
      const int total_count = batch_size * output_depth_by_input_area;
      CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
      switch (block_size) {
        case 2:
          return D2S_NCHW_LOOP<T, 2>
              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
                  total_count, input.data(), input_width, output_width,
                  output_depth_by_input_area, input_depth_by_input_area,
                  output.data());
        case 3:
          return D2S_NCHW_LOOP<T, 3>
              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
                  total_count, input.data(), input_width, output_width,
                  output_depth_by_input_area, input_depth_by_input_area,
                  output.data());
        case 4:
          return D2S_NCHW_LOOP<T, 4>
              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
                  total_count, input.data(), input_width, output_width,
                  output_depth_by_input_area, input_depth_by_input_area,
                  output.data());
      }
    }

    // Other block sizes are processed by the generic kernel.
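    // Unlike the loop kernel above, it launches one thread per input element.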
    const int total_count = batch_size * input_depth_by_input_area;
    auto config = GetCudaLaunchConfig(total_count, d);
    D2S_NCHW<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
        config.virtual_thread_count, input.data(), block_size, input_width,
        output_depth * input_height, output.data());
  }
  void operator()(const GPUDevice& d, typename TTypes<T, 5>::ConstTensor input,
                  int block_size, typename TTypes<T, 5>::Tensor output) {
    LOG(FATAL) << "5-D tensors should not be used with NCHW format";
  }
};
}  // end namespace functor

// Instantiate the GPU implementations for float.
template struct functor::DepthToSpaceOpFunctor<GPUDevice, float, FORMAT_NCHW>;
template struct functor::DepthToSpaceOpFunctor<GPUDevice, float, FORMAT_NHWC>;

// NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32.
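// The four qint8 values packed along the innermost vector dimension travel
// together as one 32-bit element, so the NCHW int32 kernels move them
// unchanged.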
template struct functor::DepthToSpaceOpFunctor<GPUDevice, int32, FORMAT_NCHW>;

}  // end namespace tensorflow

#endif  // GOOGLE_CUDA