/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

namespace {

// Backward-of-gradient kernel for 3D max pooling, NCDHW layout.
//
// One iteration per pooled (output) element.  Because no argmax mask is
// available, each thread re-scans its pooling window in `bottom_data` for
// the first element that compares equal to the pooled value in
// `output_data`, then copies the `top_diff` entry at that input location
// into `bottom_diff[index]`.  Each thread writes a distinct output element,
// so no atomics are needed.
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNCDHW(
    const int nthreads, const dtype* bottom_data, const dtype* output_data,
    const int pooled_plane, const int pooled_height, const int pooled_width,
    const int channels, const int plane, const int height, const int width,
    const int kernel_p, const int kernel_h, const int kernel_w,
    const int stride_p, const int stride_h, const int stride_w, const int pad_p,
    const int pad_t, const int pad_l, const dtype* top_diff,
    dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Recover the (n, c, pp, ph, pw) coordinates of this pooled element
    // from the flat index; width is the innermost dimension in NCDHW.
    const int out_w = index % pooled_width;
    const int out_h = (index / pooled_width) % pooled_height;
    const int out_p = (index / pooled_width / pooled_height) % pooled_plane;
    const int chan =
        (index / pooled_width / pooled_height / pooled_plane) % channels;
    const int batch =
        index / pooled_width / pooled_height / pooled_plane / channels;
    // Window extent in input coordinates, clipped to the valid volume.
    int p_begin = out_p * stride_p - pad_p;
    int h_begin = out_h * stride_h - pad_t;
    int w_begin = out_w * stride_w - pad_l;
    const int p_end = min(p_begin + kernel_p, plane);
    const int h_end = min(h_begin + kernel_h, height);
    const int w_end = min(w_begin + kernel_w, width);
    p_begin = max(p_begin, 0);
    h_begin = max(h_begin, 0);
    w_begin = max(w_begin, 0);
    bool found = false;
    // Offset of the maximum within this batch item; stays -1 when nothing
    // in the window compares equal (possible when all inputs are NaN).
    int max_offset = -1;
    const dtype* input_slice =
        bottom_data + batch * channels * plane * height * width;
    // Scan the window in p-h-w order and keep only the FIRST element equal
    // to the pooled output value.
    for (int p = p_begin; p < p_end && !found; ++p) {
      for (int h = h_begin; h < h_end && !found; ++h) {
        for (int w = w_begin; w < w_end && !found; ++w) {
          const int offset =
              chan * plane * height * width + (p * height + h) * width + w;
          if (output_data[index] == input_slice[offset]) {
            max_offset = offset;
            found = true;
          }
        }
      }
    }
    // Propagate top_diff from the located maximum; if no maximum was found
    // the output element is simply left untouched.
    if (max_offset != -1) {
      bottom_diff[index] =
          top_diff[batch * channels * plane * height * width + max_offset];
    }
  }
}

// Backward-of-gradient kernel for 3D max pooling, NDHWC (channels-last)
// layout.  Same first-match re-scan strategy as the NCDHW variant above;
// only the index arithmetic differs.
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNDHWC(
    const int nthreads, const dtype* bottom_data, const dtype* output_data,
    const int pooled_plane, const int pooled_height, const int pooled_width,
    const int channels, const int plane, const int height, const int width,
    const int kernel_p, const int kernel_h, const int kernel_w,
    const int stride_p, const int stride_h, const int stride_w, const int pad_p,
    const int pad_t, const int pad_l, const dtype* top_diff,
    dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Peel NDHWC coordinates off the flat index; channels are innermost.
    // Each window bound is clipped to the valid input volume as we go.
    int rem = index;
    const int chan = rem % channels;
    rem /= channels;
    int w_begin = (rem % pooled_width) * stride_w - pad_l;
    const int w_end = min(w_begin + kernel_w, width);
    w_begin = max(w_begin, 0);
    rem /= pooled_width;
    int h_begin = (rem % pooled_height) * stride_h - pad_t;
    const int h_end = min(h_begin + kernel_h, height);
    h_begin = max(h_begin, 0);
    rem /= pooled_height;
    int p_begin = (rem % pooled_plane) * stride_p - pad_p;
    const int p_end = min(p_begin + kernel_p, plane);
    p_begin = max(p_begin, 0);
    rem /= pooled_plane;
    const int batch = rem;
    bool found = false;
    // Offset of the maximum within this batch item; stays -1 when nothing
    // in the window compares equal (possible when all inputs are NaN).
    int max_offset = -1;
    const dtype* input_slice =
        bottom_data + batch * plane * height * width * channels;
    // First element in p-h-w order that equals the pooled output wins.
    for (int p = p_begin; p < p_end && !found; ++p) {
      for (int h = h_begin; h < h_end && !found; ++h) {
        for (int w = w_begin; w < w_end && !found; ++w) {
          const int offset = ((p * height + h) * width + w) * channels + chan;
          if (output_data[index] == input_slice[offset]) {
            max_offset = offset;
            found = true;
          }
        }
      }
    }
    // Propagate top_diff from the located maximum, if any.
    if (max_offset != -1) {
      bottom_diff[index] =
          top_diff[batch * plane * height * width * channels + max_offset];
    }
  }
}

}  // namespace

namespace functor {

// Launches the layout-appropriate kernel above.  One thread per pooled
// element; returns the device's status so the caller can detect launch
// failures.
template <typename T>
bool MaxPool3dGradBackward<T>::operator()(
    TensorFormat data_format, const T* bottom_data, const T* output_data,
    const int batch, const int pooled_plane, const int pooled_height,
    const int pooled_width, const int channels, const int plane,
    const int height, const int width, const int kernel_p, const int kernel_h,
    const int kernel_w, const int stride_p, const int stride_h,
    const int stride_w, const int pad_p, const int pad_t, const int pad_l,
    const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d) {
  const int num_kernels =
      batch * channels * pooled_plane * pooled_height * pooled_width;
  CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
  // FORMAT_NHWC denotes channels-last, which for 5-D tensors is NDHWC;
  // anything else is treated as channels-first (NCDHW).
  if (data_format == FORMAT_NHWC) {
    MaxPoolGradBackwardNoMaskNDHWC<<<config.block_count,
                                     config.thread_per_block, 0, d.stream()>>>(
        num_kernels, bottom_data, output_data, pooled_plane, pooled_height,
        pooled_width, channels, plane, height, width, kernel_p, kernel_h,
        kernel_w, stride_p, stride_h, stride_w, pad_p, pad_t, pad_l, top_diff,
        bottom_diff);
  } else {
    MaxPoolGradBackwardNoMaskNCDHW<<<config.block_count,
                                     config.thread_per_block, 0, d.stream()>>>(
        num_kernels, bottom_data, output_data, pooled_plane, pooled_height,
        pooled_width, channels, plane, height, width, kernel_p, kernel_h,
        kernel_w, stride_p, stride_h, stride_w, pad_p, pad_t, pad_l, top_diff,
        bottom_diff);
  }
  return d.ok();
}

}  // namespace functor

#define DEFINE_GPU_SPECS(T) template struct functor::MaxPool3dGradBackward<T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
#undef DEFINE_GPU_SPECS

}  // namespace tensorflow

#endif  // GOOGLE_CUDA