/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <cfloat>
#include <vector>

#include "tensorflow/core/kernels/dilation_ops.h"

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

namespace {

template <typename T>
__global__ void DilationKernel(const int32 nthreads, const T* input_ptr,
                               const T* filter_ptr, int batch, int input_rows,
                               int input_cols, int depth, int filter_rows,
                               int filter_cols, int output_rows,
                               int output_cols, int stride_rows,
                               int stride_cols, int rate_rows, int rate_cols,
                               int pad_top, int pad_left, T* output_ptr) {
  CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
    // out_idx = d + depth * (w_out + output_cols * (h_out + output_rows * b))
    const int d = out_idx % depth;
    const int out_idx2 = out_idx / depth;
    const int w_out = out_idx2 % output_cols;
    const int out_idx3 = out_idx2 / output_cols;
    const int h_out = out_idx3 % output_rows;
    const int b = out_idx3 / output_rows;
    int h_beg = h_out * stride_rows - pad_top;
    int w_beg = w_out * stride_cols - pad_left;
    T cur_val = Eigen::NumTraits<T>::lowest();
    for (int h = 0; h < filter_rows; ++h) {
      const int h_in = h_beg + h * rate_rows;
      if (h_in >= 0 && h_in < input_rows) {
        for (int w = 0; w < filter_cols; ++w) {
          const int w_in = w_beg + w * rate_cols;
          if (w_in >= 0 && w_in < input_cols) {
            const T val =
                input_ptr[d + depth * (w_in +
                                       input_cols * (h_in + input_rows * b))] +
                filter_ptr[d + depth * (w + filter_cols * h)];
            if (val > cur_val) {
              cur_val = val;
            }
          }
        }
      }
    }
    output_ptr[out_idx] = cur_val;
  }
}

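// The kernel above parallelizes grayscale morphological dilation:
//   output(b, h_out, w_out, d) =
//       max over in-bounds filter taps (h, w) of
//           input(b, h_beg + h * rate_rows, w_beg + w * rate_cols, d)
//           + filter(h, w, d)
// with h_beg/w_beg derived from the stride and padding. A minimal serial
// host-side sketch of the same recurrence follows, added purely for
// illustration; `DilationReferenceHost` is a hypothetical helper, not part
// of the TensorFlow API, and is only meant as a reference for checking the
// kernel's index arithmetic.
template <typename T>
void DilationReferenceHost(const T* input_ptr, const T* filter_ptr, int batch,
                           int input_rows, int input_cols, int depth,
                           int filter_rows, int filter_cols, int output_rows,
                           int output_cols, int stride_rows, int stride_cols,
                           int rate_rows, int rate_cols, int pad_top,
                           int pad_left, T* output_ptr) {
  const int total = batch * output_rows * output_cols * depth;
  for (int out_idx = 0; out_idx < total; ++out_idx) {
    // Same NHWC flat-index decomposition as in DilationKernel.
    const int d = out_idx % depth;
    const int w_out = (out_idx / depth) % output_cols;
    const int h_out = (out_idx / depth / output_cols) % output_rows;
    const int b = out_idx / depth / output_cols / output_rows;
    const int h_beg = h_out * stride_rows - pad_top;
    const int w_beg = w_out * stride_cols - pad_left;
    T cur_val = Eigen::NumTraits<T>::lowest();
    for (int h = 0; h < filter_rows; ++h) {
      const int h_in = h_beg + h * rate_rows;
      if (h_in < 0 || h_in >= input_rows) continue;
      for (int w = 0; w < filter_cols; ++w) {
        const int w_in = w_beg + w * rate_cols;
        if (w_in < 0 || w_in >= input_cols) continue;
        const T val =
            input_ptr[d + depth * (w_in +
                                   input_cols * (h_in + input_rows * b))] +
            filter_ptr[d + depth * (w + filter_cols * h)];
        if (val > cur_val) cur_val = val;
      }
    }
    output_ptr[out_idx] = cur_val;
  }
}
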
template <typename T>
__global__ void DilationBackpropInputKernel(
    const int32 nthreads, const T* input_ptr, const T* filter_ptr,
    const T* out_backprop_ptr, int batch, int input_rows, int input_cols,
    int depth, int filter_rows, int filter_cols, int output_rows,
    int output_cols, int stride_rows, int stride_cols, int rate_rows,
    int rate_cols, int pad_top, int pad_left, T* in_backprop_ptr) {
  CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
    // out_idx = d + depth * (w_out + output_cols * (h_out + output_rows * b))
    const int d = out_idx % depth;
    const int out_idx2 = out_idx / depth;
    const int w_out = out_idx2 % output_cols;
    const int out_idx3 = out_idx2 / output_cols;
    const int h_out = out_idx3 % output_rows;
    const int b = out_idx3 / output_rows;
    int h_beg = h_out * stride_rows - pad_top;
    int w_beg = w_out * stride_cols - pad_left;
    T cur_val = Eigen::NumTraits<T>::lowest();
    int h_in_max = (h_beg < 0) ? 0 : h_beg;
    int w_in_max = (w_beg < 0) ? 0 : w_beg;
    // In the case of multiple argmax branches, we only back-propagate along
    // the first branch found, i.e., the one with the smallest value of
    // `h * filter_cols + w` (the comparison below is strict, so ties do not
    // update the argmax), as in the max-pooling backward routines.
    for (int h = 0; h < filter_rows; ++h) {
      const int h_in = h_beg + h * rate_rows;
      if (h_in >= 0 && h_in < input_rows) {
        for (int w = 0; w < filter_cols; ++w) {
          const int w_in = w_beg + w * rate_cols;
          if (w_in >= 0 && w_in < input_cols) {
            const T val =
                input_ptr[d + depth * (w_in +
                                       input_cols * (h_in + input_rows * b))] +
                filter_ptr[d + depth * (w + filter_cols * h)];
            if (val > cur_val) {
              cur_val = val;
              h_in_max = h_in;
              w_in_max = w_in;
            }
          }
        }
      }
    }
    CudaAtomicAdd(
        in_backprop_ptr + d +
            depth * (w_in_max + input_cols * (h_in_max + input_rows * b)),
        out_backprop_ptr[out_idx]);
  }
}

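// Note on the CudaAtomicAdd above: the input gradient is a scatter-add.
// Each output position routes its incoming gradient to the single input
// cell that won the argmax, and whenever the stride is smaller than the
// (dilated) filter extent, neighboring output windows overlap, so several
// GPU threads may select the same input cell. A plain store would race;
// the atomic makes the accumulation
//   in_backprop[d + depth * (w_in_max +
//                            input_cols * (h_in_max + input_rows * b))] +=
//       out_backprop[out_idx];
// well-defined across threads.
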
template <typename T>
__global__ void DilationBackpropFilterKernel(
    const int32 nthreads, const T* input_ptr, const T* filter_ptr,
    const T* out_backprop_ptr, int batch, int input_rows, int input_cols,
    int depth, int filter_rows, int filter_cols, int output_rows,
    int output_cols, int stride_rows, int stride_cols, int rate_rows,
    int rate_cols, int pad_top, int pad_left, T* filter_backprop_ptr) {
  CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
    // out_idx = d + depth * (w_out + output_cols * (h_out + output_rows * b))
    const int d = out_idx % depth;
    const int out_idx2 = out_idx / depth;
    const int w_out = out_idx2 % output_cols;
    const int out_idx3 = out_idx2 / output_cols;
    const int h_out = out_idx3 % output_rows;
    const int b = out_idx3 / output_rows;
    int h_beg = h_out * stride_rows - pad_top;
    int w_beg = w_out * stride_cols - pad_left;
    T cur_val = Eigen::NumTraits<T>::lowest();
    int h_max = 0;
    int w_max = 0;
    // In the case of multiple argmax branches, we only back-propagate along
    // the first branch found, i.e., the one with the smallest value of
    // `h * filter_cols + w` (the comparison below is strict, so ties do not
    // update the argmax), as in the max-pooling backward routines.
    for (int h = 0; h < filter_rows; ++h) {
      const int h_in = h_beg + h * rate_rows;
      if (h_in >= 0 && h_in < input_rows) {
        for (int w = 0; w < filter_cols; ++w) {
          const int w_in = w_beg + w * rate_cols;
          if (w_in >= 0 && w_in < input_cols) {
            const T val =
                input_ptr[d + depth * (w_in +
                                       input_cols * (h_in + input_rows * b))] +
                filter_ptr[d + depth * (w + filter_cols * h)];
            if (val > cur_val) {
              cur_val = val;
              h_max = h;
              w_max = w;
            }
          }
        }
      }
    }
    CudaAtomicAdd(
        filter_backprop_ptr + d + depth * (w_max + filter_cols * h_max),
        out_backprop_ptr[out_idx]);
  }
}

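// The filter gradient is an even more contended scatter target: the
// destination offset d + depth * (w_max + filter_cols * h_max) involves
// neither the batch index nor the spatial output indices, so every output
// position across the whole batch that selects filter tap (h_max, w_max)
// accumulates into the same cell. CudaAtomicAdd is what makes that
// many-to-one accumulation correct.
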
}  // namespace

namespace functor {

template <typename T>
struct Dilation<GPUDevice, T> {
  void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  typename TTypes<T, 3>::ConstTensor filter, int stride_rows,
                  int stride_cols, int rate_rows, int rate_cols, int pad_top,
                  int pad_left, typename TTypes<T, 4>::Tensor output) {
    const int batch = input.dimension(0);
    const int input_rows = input.dimension(1);
    const int input_cols = input.dimension(2);
    const int depth = input.dimension(3);

    const int filter_rows = filter.dimension(0);
    const int filter_cols = filter.dimension(1);

    const int output_rows = output.dimension(1);
    const int output_cols = output.dimension(2);

    const int total_count = batch * output_rows * output_cols * depth;
    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);

    DilationKernel<<<config.block_count, config.thread_per_block, 0,
                     d.stream()>>>(
        config.virtual_thread_count, input.data(), filter.data(), batch,
        input_rows, input_cols, depth, filter_rows, filter_cols, output_rows,
        output_cols, stride_rows, stride_cols, rate_rows, rate_cols, pad_top,
        pad_left, output.data());
  }
};

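// A hedged usage sketch: the device-generic op implementation in
// ../kernels/dilation_ops.cc dispatches to this functor roughly as
//   functor::Dilation<GPUDevice, T>()(
//       context->eigen_device<GPUDevice>(), input.tensor<T, 4>(),
//       filter.tensor<T, 3>(), stride_rows, stride_cols, rate_rows,
//       rate_cols, pad_top, pad_left, output->tensor<T, 4>());
// where `context` is the OpKernelContext of the Dilation2D kernel.
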
template <typename T>
struct DilationBackpropInput<GPUDevice, T> {
  void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  typename TTypes<T, 3>::ConstTensor filter,
                  typename TTypes<T, 4>::ConstTensor out_backprop,
                  int stride_rows, int stride_cols, int rate_rows,
                  int rate_cols, int pad_top, int pad_left,
                  typename TTypes<T, 4>::Tensor in_backprop) {
    const int batch = input.dimension(0);
    const int input_rows = input.dimension(1);
    const int input_cols = input.dimension(2);
    const int depth = input.dimension(3);

    const int filter_rows = filter.dimension(0);
    const int filter_cols = filter.dimension(1);

    const int output_rows = out_backprop.dimension(1);
    const int output_cols = out_backprop.dimension(2);

    int total_count;
    CudaLaunchConfig config;

    // Initialize in_backprop with all zeros.
    total_count = batch * input_rows * input_cols * depth;
    config = GetCudaLaunchConfig(total_count, d);
    SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
        total_count, in_backprop.data());

    // Accumulate.
    total_count = batch * output_rows * output_cols * depth;
    config = GetCudaLaunchConfig(total_count, d);
    DilationBackpropInputKernel<<<config.block_count, config.thread_per_block,
                                  0, d.stream()>>>(
        config.virtual_thread_count, input.data(), filter.data(),
        out_backprop.data(), batch, input_rows, input_cols, depth, filter_rows,
        filter_cols, output_rows, output_cols, stride_rows, stride_cols,
        rate_rows, rate_cols, pad_top, pad_left, in_backprop.data());
  }
};

template <typename T>
struct DilationBackpropFilter<GPUDevice, T> {
  void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  typename TTypes<T, 3>::ConstTensor filter,
                  typename TTypes<T, 4>::ConstTensor out_backprop,
                  int stride_rows, int stride_cols, int rate_rows,
                  int rate_cols, int pad_top, int pad_left,
                  typename TTypes<T, 3>::Tensor filter_backprop) {
    const int batch = input.dimension(0);
    const int input_rows = input.dimension(1);
    const int input_cols = input.dimension(2);
    const int depth = input.dimension(3);

    const int filter_rows = filter.dimension(0);
    const int filter_cols = filter.dimension(1);

    const int output_rows = out_backprop.dimension(1);
    const int output_cols = out_backprop.dimension(2);

    int total_count;
    CudaLaunchConfig config;

    // Initialize filter_backprop with all zeros.
    total_count = filter_rows * filter_cols * depth;
    config = GetCudaLaunchConfig(total_count, d);
    SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
        total_count, filter_backprop.data());

    // Accumulate.
    total_count = batch * output_rows * output_cols * depth;
    config = GetCudaLaunchConfig(total_count, d);
    DilationBackpropFilterKernel<<<config.block_count, config.thread_per_block,
                                   0, d.stream()>>>(
        config.virtual_thread_count, input.data(), filter.data(),
        out_backprop.data(), batch, input_rows, input_cols, depth, filter_rows,
        filter_cols, output_rows, output_cols, stride_rows, stride_cols,
        rate_rows, rate_cols, pad_top, pad_left, filter_backprop.data());
  }
};

}  // namespace functor

#define DEFINE_GPU_SPECS(T)                                     \
  template struct functor::Dilation<GPUDevice, T>;              \
  template struct functor::DilationBackpropInput<GPUDevice, T>; \
  template struct functor::DilationBackpropFilter<GPUDevice, T>;

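// The macro call below invokes DEFINE_GPU_SPECS once per GPU-supported
// floating-point type, yielding explicit instantiations along the lines of
//   template struct functor::Dilation<GPUDevice, float>;
// (and likewise for the other types, e.g. Eigen::half and double in this
// generation of TensorFlow), so the templates above are compiled into this
// translation unit for each element type the op registers on GPU.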
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);

#undef DEFINE_GPU_SPECS

}  // namespace tensorflow

#endif  // GOOGLE_CUDA