/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <cfloat>
#include <vector>

#include "tensorflow/core/kernels/dilation_ops.h"

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

namespace {

// Forward pass of 2-D morphological dilation over an NHWC input.
//
// Launched on a 1-D grid; CUDA_1D_KERNEL_LOOP walks output elements with a
// grid-sized stride, so any launch configuration covers all `nthreads`
// elements. Each output element out[b, h_out, w_out, d] is the maximum of
// input[b, h_in, w_in, d] + filter[fh, fw, d] over all filter taps that land
// inside the (unpadded) input.
template <typename T>
__global__ void DilationKernel(const int32 nthreads, const T* input_ptr,
                               const T* filter_ptr, int batch, int input_rows,
                               int input_cols, int depth, int filter_rows,
                               int filter_cols, int output_rows,
                               int output_cols, int stride_rows,
                               int stride_cols, int rate_rows, int rate_cols,
                               int pad_top, int pad_left, T* output_ptr) {
  CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
    // Unpack the flat NHWC offset:
    // out_idx = d + depth * (w_out + output_cols * (h_out + output_rows * b)).
    const int d = out_idx % depth;
    int idx = out_idx / depth;
    const int w_out = idx % output_cols;
    idx /= output_cols;
    const int h_out = idx % output_rows;
    const int b = idx / output_rows;
    // Top-left corner of the filter window in input coordinates; may be
    // negative when the window overlaps the implicit zero padding.
    const int h_start = h_out * stride_rows - pad_top;
    const int w_start = w_out * stride_cols - pad_left;
    // Seed with the smallest representable T so any in-bounds tap wins.
    T best = Eigen::NumTraits<T>::lowest();
    for (int fh = 0; fh < filter_rows; ++fh) {
      const int h_in = h_start + fh * rate_rows;
      if (h_in < 0 || h_in >= input_rows) continue;  // row in the padding
      for (int fw = 0; fw < filter_cols; ++fw) {
        const int w_in = w_start + fw * rate_cols;
        if (w_in < 0 || w_in >= input_cols) continue;  // column in the padding
        const int in_off =
            d + depth * (w_in + input_cols * (h_in + input_rows * b));
        const int flt_off = d + depth * (fw + filter_cols * fh);
        const T candidate = input_ptr[in_off] + filter_ptr[flt_off];
        if (best < candidate) {
          best = candidate;
        }
      }
    }
    output_ptr[out_idx] = best;
  }
}

// Gradient of dilation w.r.t. the input.
//
// Each thread replays the forward argmax search for one output element and
// atomically adds the incoming gradient to the input position that produced
// the maximum. `in_backprop_ptr` must be zero-initialized before launch.
template <typename T>
__global__ void DilationBackpropInputKernel(
    const int32 nthreads, const T* input_ptr, const T* filter_ptr,
    const T* out_backprop_ptr, int batch, int input_rows, int input_cols,
    int depth, int filter_rows, int filter_cols, int output_rows,
    int output_cols, int stride_rows, int stride_cols, int rate_rows,
    int rate_cols, int pad_top, int pad_left, T* in_backprop_ptr) {
  CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
    // out_idx = d + depth * (w_out + output_cols * (h_out + output_rows * b)).
    const int d = out_idx % depth;
    int idx = out_idx / depth;
    const int w_out = idx % output_cols;
    idx /= output_cols;
    const int h_out = idx % output_rows;
    const int b = idx / output_rows;
    const int h_start = h_out * stride_rows - pad_top;
    const int w_start = w_out * stride_cols - pad_left;
    T best = Eigen::NumTraits<T>::lowest();
    // Fallback argmax position: the window corner clamped into the input.
    // Only used if no tap beats `lowest()`.
    int h_in_max = (h_start < 0) ? 0 : h_start;
    int w_in_max = (w_start < 0) ? 0 : w_start;
    // If several taps attain the maximum, the strict comparison below routes
    // the whole gradient to a single branch (the first maximal tap in scan
    // order), analogously to the max-pooling backward routines.
    for (int fh = 0; fh < filter_rows; ++fh) {
      const int h_in = h_start + fh * rate_rows;
      if (h_in < 0 || h_in >= input_rows) continue;
      for (int fw = 0; fw < filter_cols; ++fw) {
        const int w_in = w_start + fw * rate_cols;
        if (w_in < 0 || w_in >= input_cols) continue;
        const int in_off =
            d + depth * (w_in + input_cols * (h_in + input_rows * b));
        const int flt_off = d + depth * (fw + filter_cols * fh);
        const T candidate = input_ptr[in_off] + filter_ptr[flt_off];
        if (best < candidate) {
          best = candidate;
          h_in_max = h_in;
          w_in_max = w_in;
        }
      }
    }
    // Overlapping windows can select the same input position from different
    // threads, so the scatter-add must be atomic.
    CudaAtomicAdd(
        in_backprop_ptr + d +
            depth * (w_in_max + input_cols * (h_in_max + input_rows * b)),
        out_backprop_ptr[out_idx]);
  }
}

// Gradient of dilation w.r.t. the filter.
//
// Same argmax replay as the input gradient, but the winning (fh, fw) filter
// tap receives the gradient instead of the winning input position.
// `filter_backprop_ptr` must be zero-initialized before launch.
template <typename T>
__global__ void DilationBackpropFilterKernel(
    const int32 nthreads, const T* input_ptr, const T* filter_ptr,
    const T* out_backprop_ptr, int batch, int input_rows, int input_cols,
    int depth, int filter_rows, int filter_cols, int output_rows,
    int output_cols, int stride_rows, int stride_cols, int rate_rows,
    int rate_cols, int pad_top, int pad_left, T* filter_backprop_ptr) {
  CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
    // out_idx = d + depth * (w_out + output_cols * (h_out + output_rows * b)).
    const int d = out_idx % depth;
    int idx = out_idx / depth;
    const int w_out = idx % output_cols;
    idx /= output_cols;
    const int h_out = idx % output_rows;
    const int b = idx / output_rows;
    const int h_start = h_out * stride_rows - pad_top;
    const int w_start = w_out * stride_cols - pad_left;
    T best = Eigen::NumTraits<T>::lowest();
    // Fallback argmax tap when no tap is in bounds.
    int h_max = 0;
    int w_max = 0;
    // Ties resolve to a single branch (first maximal tap in scan order, due
    // to the strict comparison), analogously to max-pooling backward.
    for (int fh = 0; fh < filter_rows; ++fh) {
      const int h_in = h_start + fh * rate_rows;
      if (h_in < 0 || h_in >= input_rows) continue;
      for (int fw = 0; fw < filter_cols; ++fw) {
        const int w_in = w_start + fw * rate_cols;
        if (w_in < 0 || w_in >= input_cols) continue;
        const int in_off =
            d + depth * (w_in + input_cols * (h_in + input_rows * b));
        const int flt_off = d + depth * (fw + filter_cols * fh);
        const T candidate = input_ptr[in_off] + filter_ptr[flt_off];
        if (best < candidate) {
          best = candidate;
          h_max = fh;
          w_max = fw;
        }
      }
    }
    // Every output element maps onto the small filter tensor, so contention
    // is expected; the accumulation must be atomic.
    CudaAtomicAdd(
        filter_backprop_ptr + d + depth * (w_max + filter_cols * h_max),
        out_backprop_ptr[out_idx]);
  }
}

}  // namespace

namespace functor {

// GPU specialization of the Dilation functor: one virtual thread per output
// element, launched on the device's stream.
template <typename T>
struct Dilation<GPUDevice, T> {
  void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  typename TTypes<T, 3>::ConstTensor filter, int stride_rows,
                  int stride_cols, int rate_rows, int rate_cols, int pad_top,
                  int pad_left, typename TTypes<T, 4>::Tensor output) {
    const int batch = input.dimension(0);
    const int input_rows = input.dimension(1);
    const int input_cols = input.dimension(2);
    const int depth = input.dimension(3);

    const int filter_rows = filter.dimension(0);
    const int filter_cols = filter.dimension(1);

    const int output_rows = output.dimension(1);
    const int output_cols = output.dimension(2);

    const int out_count = batch * output_rows * output_cols * depth;
    CudaLaunchConfig config = GetCudaLaunchConfig(out_count, d);

    DilationKernel<<<config.block_count, config.thread_per_block, 0,
                     d.stream()>>>(
        config.virtual_thread_count, input.data(), filter.data(), batch,
        input_rows, input_cols, depth, filter_rows, filter_cols, output_rows,
        output_cols, stride_rows, stride_cols, rate_rows, rate_cols, pad_top,
        pad_left, output.data());
  }
};

// GPU specialization of the input-gradient functor: zero the gradient buffer,
// then scatter-add one contribution per output element.
template <typename T>
struct DilationBackpropInput<GPUDevice, T> {
  void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  typename TTypes<T, 3>::ConstTensor filter,
                  typename TTypes<T, 4>::ConstTensor out_backprop,
                  int stride_rows, int stride_cols, int rate_rows,
                  int rate_cols, int pad_top, int pad_left,
                  typename TTypes<T, 4>::Tensor in_backprop) {
    const int batch = input.dimension(0);
    const int input_rows = input.dimension(1);
    const int input_cols = input.dimension(2);
    const int depth = input.dimension(3);

    const int filter_rows = filter.dimension(0);
    const int filter_cols = filter.dimension(1);

    const int output_rows = out_backprop.dimension(1);
    const int output_cols = out_backprop.dimension(2);

    // Initialize in_backprop with all zeros (the kernel only accumulates).
    const int in_count = batch * input_rows * input_cols * depth;
    CudaLaunchConfig config = GetCudaLaunchConfig(in_count, d);
    SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
        in_count, in_backprop.data());

    // Accumulate: one virtual thread per output element.
    const int out_count = batch * output_rows * output_cols * depth;
    config = GetCudaLaunchConfig(out_count, d);
    DilationBackpropInputKernel<<<config.block_count, config.thread_per_block,
                                  0, d.stream()>>>(
        config.virtual_thread_count, input.data(), filter.data(),
        out_backprop.data(), batch, input_rows, input_cols, depth, filter_rows,
        filter_cols, output_rows, output_cols, stride_rows, stride_cols,
        rate_rows, rate_cols, pad_top, pad_left, in_backprop.data());
  }
};

// GPU specialization of the filter-gradient functor: zero the (small) filter
// gradient, then scatter-add one contribution per output element.
template <typename T>
struct DilationBackpropFilter<GPUDevice, T> {
  void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  typename TTypes<T, 3>::ConstTensor filter,
                  typename TTypes<T, 4>::ConstTensor out_backprop,
                  int stride_rows, int stride_cols, int rate_rows,
                  int rate_cols, int pad_top, int pad_left,
                  typename TTypes<T, 3>::Tensor filter_backprop) {
    const int batch = input.dimension(0);
    const int input_rows = input.dimension(1);
    const int input_cols = input.dimension(2);
    const int depth = input.dimension(3);

    const int filter_rows = filter.dimension(0);
    const int filter_cols = filter.dimension(1);

    const int output_rows = out_backprop.dimension(1);
    const int output_cols = out_backprop.dimension(2);

    // Initialize filter_backprop with all zeros (the kernel only accumulates).
    const int filter_count = filter_rows * filter_cols * depth;
    CudaLaunchConfig config = GetCudaLaunchConfig(filter_count, d);
    SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
        filter_count, filter_backprop.data());

    // Accumulate: one virtual thread per output element.
    const int out_count = batch * output_rows * output_cols * depth;
    config = GetCudaLaunchConfig(out_count, d);
    DilationBackpropFilterKernel<<<config.block_count, config.thread_per_block,
                                   0, d.stream()>>>(
        config.virtual_thread_count, input.data(), filter.data(),
        out_backprop.data(), batch, input_rows, input_cols, depth, filter_rows,
        filter_cols, output_rows, output_cols, stride_rows, stride_cols,
        rate_rows, rate_cols, pad_top, pad_left, filter_backprop.data());
  }
};

}  // namespace functor

#define DEFINE_GPU_SPECS(T)                                     \
  template struct functor::Dilation<GPUDevice, T>;              \
  template struct functor::DilationBackpropInput<GPUDevice, T>; \
  template struct functor::DilationBackpropFilter<GPUDevice, T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);

#undef DEFINE_GPU_SPECS

}  // namespace tensorflow

#endif  // GOOGLE_CUDA