/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

namespace {

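// Backward pass of MaxPool3DGrad for NCDHW-ordered tensors, computed without
// an argmax mask: each pooled output element re-scans its pooling window in
// bottom_data to locate the position of the maximum (output_data[index]) and
// forwards the top_diff value at that position into bottom_diff[index].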
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNCDHW(
    const int nthreads, const dtype* bottom_data, const dtype* output_data,
    const int pooled_plane, const int pooled_height, const int pooled_width,
    const int channels, const int plane, const int height, const int width,
    const int kernel_p, const int kernel_h, const int kernel_w,
    const int stride_p, const int stride_h, const int stride_w, const int pad_p,
    const int pad_t, const int pad_l, const dtype* top_diff,
    dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // First find out the index to the maximum, since we have no mask.
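    // Decompose the flat output index into (n, c, pp, ph, pw) and derive the
    // pooling window [pstart, pend) x [hstart, hend) x [wstart, wend) in the
    // input, clipped to the unpadded input extent.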
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int pp = (index / pooled_width / pooled_height) % pooled_plane;
    int c = (index / pooled_width / pooled_height / pooled_plane) % channels;
    int n = (index / pooled_width / pooled_height / pooled_plane / channels);
    int pstart = pp * stride_p - pad_p;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    const int pend = min(pstart + kernel_p, plane);
    const int hend = min(hstart + kernel_h, height);
    const int wend = min(wstart + kernel_w, width);
    pstart = max(pstart, 0);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    bool should_stop = false;
    int maxidx = -1;
    const dtype* bottom_data_n =
        bottom_data + n * channels * plane * height * width;
    // Propagate only the first top_diff value corresponding to the maximum.
    for (int p = pstart; p < pend && !should_stop; ++p) {
      for (int h = hstart; h < hend && !should_stop; ++h) {
        for (int w = wstart; w < wend && !should_stop; ++w) {
          int idx = c * plane * height * width + (p * height + h) * width + w;
          if (output_data[index] == bottom_data_n[idx]) {
            maxidx = idx;
            should_stop = true;
          }
        }
      }
    }
    // Set the bottom diff (an atomic is not necessary). The index can still be
    // uninitialized if all of the bottom_data values are NaN.
    if (maxidx != -1) {
      bottom_diff[index] =
          top_diff[n * channels * plane * height * width + maxidx];
    }
  }
}

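// Same as MaxPoolGradBackwardNoMaskNCDHW above, but for NDHWC-ordered
// (channels-last) tensors.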
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNDHWC(
    const int nthreads, const dtype* bottom_data, const dtype* output_data,
    const int pooled_plane, const int pooled_height, const int pooled_width,
    const int channels, const int plane, const int height, const int width,
    const int kernel_p, const int kernel_h, const int kernel_w,
    const int stride_p, const int stride_h, const int stride_w, const int pad_p,
    const int pad_t, const int pad_l, const dtype* top_diff,
    dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // First find out the index to the maximum, since we have no mask.
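    // Peel the flat NDHWC index from the innermost dimension (channels)
    // outward, computing each pooling-window extent along the way; after the
    // last division, n holds the batch index.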
    int n = index;
    int c = n % channels;
    n /= channels;
    int wstart = (n % pooled_width) * stride_w - pad_l;
    int wend = min(wstart + kernel_w, width);
    wstart = max(wstart, 0);
    n /= pooled_width;
    int hstart = (n % pooled_height) * stride_h - pad_t;
    int hend = min(hstart + kernel_h, height);
    hstart = max(hstart, 0);
    n /= pooled_height;
    int pstart = (n % pooled_plane) * stride_p - pad_p;
    int pend = min(pstart + kernel_p, plane);
    pstart = max(pstart, 0);
    n /= pooled_plane;
    bool should_stop = false;
    int maxidx = -1;
    const dtype* bottom_data_n =
        bottom_data + n * plane * height * width * channels;
    // Propagate only the first top_diff value corresponding to the maximum.
    for (int p = pstart; p < pend && !should_stop; ++p) {
      for (int h = hstart; h < hend && !should_stop; ++h) {
        for (int w = wstart; w < wend && !should_stop; ++w) {
          int idx = ((p * height + h) * width + w) * channels + c;
          if (output_data[index] == bottom_data_n[idx]) {
            maxidx = idx;
            should_stop = true;
          }
        }
      }
    }
    // Set the bottom diff (an atomic is not necessary). The index can still be
    // uninitialized if all of the bottom_data values are NaN.
    if (maxidx != -1) {
      bottom_diff[index] =
          top_diff[n * plane * height * width * channels + maxidx];
    }
  }
}

}  // namespace

namespace functor {

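// Launches the layout-specific kernel above, chosen from data_format, and
// reports whether the device stream is still in a good state.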
template <typename T>
bool MaxPool3dGradBackward<T>::operator()(
    TensorFormat data_format, const T* bottom_data, const T* output_data,
    const int batch, const int pooled_plane, const int pooled_height,
    const int pooled_width, const int channels, const int plane,
    const int height, const int width, const int kernel_p, const int kernel_h,
    const int kernel_w, const int stride_p, const int stride_h,
    const int stride_w, const int pad_p, const int pad_t, const int pad_l,
    const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d) {
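  // One grid-stride loop iteration per pooled output element.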
  int num_kernels =
      batch * channels * pooled_plane * pooled_height * pooled_width;
  CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
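  // FORMAT_NHWC denotes the channels-last layout, i.e. NDHWC for 3-D pooling.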
  if (data_format == FORMAT_NHWC) {
    MaxPoolGradBackwardNoMaskNDHWC<<<config.block_count,
                                     config.thread_per_block, 0, d.stream()>>>(
        num_kernels, bottom_data, output_data, pooled_plane, pooled_height,
        pooled_width, channels, plane, height, width, kernel_p, kernel_h,
        kernel_w, stride_p, stride_h, stride_w, pad_p, pad_t, pad_l, top_diff,
        bottom_diff);
  } else {
    MaxPoolGradBackwardNoMaskNCDHW<<<config.block_count,
                                     config.thread_per_block, 0, d.stream()>>>(
        num_kernels, bottom_data, output_data, pooled_plane, pooled_height,
        pooled_width, channels, plane, height, width, kernel_p, kernel_h,
        kernel_w, stride_p, stride_h, stride_w, pad_p, pad_t, pad_l, top_diff,
        bottom_diff);
  }
  return d.ok();
}

}  // namespace functor

#define DEFINE_GPU_SPECS(T) template struct functor::MaxPool3dGradBackward<T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
#undef DEFINE_GPU_SPECS

}  // namespace tensorflow

#endif  // GOOGLE_CUDA