/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <stdio.h>
#include <cfloat>

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/kernels/maxpooling_op.h"
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {
namespace {
template <bool propagate_nans, typename dtype>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool IsGreaterThan(dtype a, dtype b) {
  if (propagate_nans) {
    return !(a <= b);
  } else {
    return a > b;
  }
}

// This is Yangqing's custom kernel for the maxpooling operation. There are
// three functions: MaxPoolForwardNCHW and MaxPoolForwardNHWC are the two
// forward functions, dealing with the forward case. MaxPoolBackward is the
// backward function that deals with the backward case for both storage orders.
// The parameters to the kernels in the forward function are as follows:
//     nthreads: the number of threads, which is equal to the output size.
//     bottom_data: the bottom data of N*H*W*C (or N*C*H*W) items.
//     height, width, pooled_height, pooled_width: the input and output sizes.
//     kernel_h, kernel_w: the kernel sizes.
//     stride_h, stride_w: the strides.
//     pad_t, pad_l: the padding values on the top and left side.
//     top_data: the maxpool output.
//     mask: the output mask of the same size as top_data. It is stored in
//         int form, keeping track of the flattened index of the input item
//         that produces the max output. If a nullptr is passed in for mask,
//         no mask will be produced.
//
// To call the forward and backward functions, use e.g.:
//     const int kThreadsPerBlock = 1024;
//     const int output_size = batch * channels * pooled_height * pooled_width;
//     MaxPoolForwardNCHW<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
//                          kThreadsPerBlock, 0, cuda_stream>>>(...);
template <bool propagate_nans, typename dtype>
__global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
                                   const int channels, const int height,
                                   const int width, const int pooled_height,
                                   const int pooled_width, const int kernel_h,
                                   const int kernel_w, const int stride_h,
                                   const int stride_w, const int pad_t,
                                   const int pad_l, dtype* top_data,
                                   int64* mask) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    dtype maxval = Eigen::NumTraits<dtype>::lowest();
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * channels * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int idx = c * height * width + h * width + w;
        if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
          maxidx = idx;
          maxval = bottom_data_n[idx];
        }
      }
    }
    top_data[index] = maxval;
    if (mask != nullptr) {
      mask[index] = maxidx;
    }
  }
}

// The parameters for MaxPoolForwardNoMaskKernel_NCHW_VECT_C are the same as
// for MaxPoolForwardNCHW above, except that the mask is not supported, and
// each element of the input and output contains 4 adjacent channel values
// for the same (x, y) coordinate.
// (so channels = outer_channels, output_size = real output size / 4).
__global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
    const int nthreads, const int32* bottom_data, const int height,
    const int width, const int channels, const int pooled_height,
    const int pooled_width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    int32* top_data) {
  // TODO(pauldonnelly): Implement a better optimized version of this kernel.
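  // 0x80808080 packs the int8 value -128 (0x80) into each of the four bytes
  // of an int32, i.e. the smallest possible value for every packed channel;
  // __vmaxs4 below then takes a per-byte signed maximum, so maxval tracks the
  // running max of all four packed channels at once.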
  const int32 kMinINT8X4 = 0x80808080;
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    int32 maxval = kMinINT8X4;
    const int32* bottom_data_n = bottom_data + n * channels * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int idx = (c * height + h) * width + w;
        maxval = __vmaxs4(maxval, bottom_data_n[idx]);
      }
    }
    top_data[index] = maxval;
  }
}

template <bool propagate_nans, typename dtype>
__global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
                                   const int height, const int width,
                                   const int channels, const int pooled_height,
                                   const int pooled_width, const int kernel_h,
                                   const int kernel_w, const int stride_h,
                                   const int stride_w, const int pad_t,
                                   const int pad_l, dtype* top_data,
                                   int64* mask) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int n = index;
    int c = n % channels;
    n /= channels;
    int wstart = (n % pooled_width) * stride_w - pad_l;
    n /= pooled_width;
    int hstart = (n % pooled_height) * stride_h - pad_t;
    n /= pooled_height;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    dtype maxval = Eigen::NumTraits<dtype>::lowest();
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * height * width * channels;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int idx = (h * width + w) * channels + c;
        if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
          maxidx = idx;
          maxval = bottom_data_n[idx];
        }
      }
    }
    top_data[index] = maxval;
    if (mask != nullptr) {
      mask[index] = maxidx;
    }
  }
}

template <typename dtype>
__global__ void MaxPoolBackwardNoMaskNHWC(
    const int nthreads, const dtype* bottom_data, const int height,
    const int width, const int channels, const int pooled_height,
    const int pooled_width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    const dtype* top_diff, dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // First find out the index to the maximum, since we have no mask.
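    // In NHWC storage, index = ((n * pooled_height + ph) * pooled_width + pw)
    //     * channels + c, so peeling off channels, pooled_width and
    //     pooled_height below recovers the channel and the start of the
    //     pooling window for this output element.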
    int n = index;
    int c = n % channels;
    n /= channels;
    int wstart = (n % pooled_width) * stride_w - pad_l;
    n /= pooled_width;
    int hstart = (n % pooled_height) * stride_h - pad_t;
    n /= pooled_height;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    dtype maxval = Eigen::NumTraits<dtype>::lowest();
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * height * width * channels;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int idx = (h * width + w) * channels + c;
        if (bottom_data_n[idx] > maxval) {
          maxidx = idx;
          maxval = bottom_data_n[idx];
        }
      }
    }

    // Atomically accumulate the bottom diff. The index could still be
    // uninitialized, if all the bottom_data are NaN.
    if (maxidx != -1) {
      CudaAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
                    top_diff[index]);
    }
  }
}

// The parameters to the kernels in the backward function are as follows:
//     nthreads: the number of threads, which is equal to the output size.
//     top_diff: the gradient of the output data, of size N*Hout*Wout*C (or
//         N*C*Hout*Wout). As we have stored the flattened index of the input
//         entries, the backward function is agnostic of the input storage
//         order.
//     mask: the output mask of the same size as top_data. It is stored in
//         int form, keeping track of the flattened index of the input item
//         that produces the max output.
//     top_offset: the pre-computed per-image offset of the maxpool output.
//         This is equal to Hout*Wout*C. We choose to pre-compute this so we
//         do not need to compute it every time inside the kernel.
//     bottom_offset: the pre-computed per-image offset of the maxpool input.
//         This is equal to H*W*C.
//     bottom_diff: the gradient with respect to the input.
// This function relies on CudaAtomicAdd to avoid race conditions. Also, before
// the kernel is run, you will need to make sure that bottom_diff is filled
// with zeros first.
template <typename dtype>
__global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
                                const int64* mask, const int top_offset,
                                const int bottom_offset, dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int image_id = (index / top_offset);
    CudaAtomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
                  top_diff[index]);
  }
}

// The parameters to the kernels in the gradient-of-gradient function are as
// follows:
//     nthreads: the number of threads, which is equal to the output size. The
//         gradient of the MaxPooling gradient w.r.t. the output data has
//         dimensions of N*C*Hout*Wout.
//     bottom_data: the bottom data of N*H*W*C (or N*C*H*W) items.
//     output_data: the output data of N*Hout*Wout*C (or N*C*Hout*Wout) items.
//     height, width, pooled_height, pooled_width: the input and output sizes.
//     kernel_h, kernel_w: the kernel sizes.
//     stride_h, stride_w: the strides.
//     pad_t, pad_l: the padding values on the top and left side.
//     top_diff: the gradient of the gradient of the output data w.r.t. the
//         input data, of size N*H*W*C (or N*C*H*W).
//     bottom_diff: the gradient of the gradient w.r.t. the output.
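//
// For example, the functor MaxPoolGradBackwardNoMask further below launches
// these kernels roughly as follows (a sketch; num_kernels is the output size
// and d is the Eigen::GpuDevice):
//     CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
//     MaxPoolGradBackwardNoMaskNCHW<<<config.block_count,
//                                     config.thread_per_block, 0,
//                                     d.stream()>>>(num_kernels, ...);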
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNCHW(
    const int nthreads, const dtype* bottom_data, const dtype* output_data,
    const int pooled_height, const int pooled_width, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    const dtype* top_diff, dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // First find out the index to the maximum, since we have no mask.
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    const int hend = min(hstart + kernel_h, height);
    const int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    bool should_stop = false;
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * channels * height * width;
    // Propagate only first value from top_diff corresponding to the maximum.
    for (int h = hstart; h < hend && !should_stop; ++h) {
      for (int w = wstart; w < wend && !should_stop; ++w) {
        int idx = c * height * width + h * width + w;
        if (output_data[index] == bottom_data_n[idx]) {
          maxidx = idx;
          should_stop = true;
        }
      }
    }
    // Set the bottom diff (atomic is not necessary). The index could still be
    // uninitialized, if all the bottom_data are NaN.
    if (maxidx != -1) {
      bottom_diff[index] = top_diff[n * channels * height * width + maxidx];
    }
  }
}

template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNHWC(
    const int nthreads, const dtype* bottom_data, const dtype* output_data,
    const int pooled_height, const int pooled_width, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    const dtype* top_diff, dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // First find out the index to the maximum, since we have no mask.
    int n = index;
    int c = n % channels;
    n /= channels;
    int wstart = (n % pooled_width) * stride_w - pad_l;
    n /= pooled_width;
    int hstart = (n % pooled_height) * stride_h - pad_t;
    n /= pooled_height;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    bool should_stop = false;
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * height * width * channels;
    // Propagate only first value from top_diff corresponding to the maximum.
    for (int h = hstart; h < hend && !should_stop; ++h) {
      for (int w = wstart; w < wend && !should_stop; ++w) {
        int idx = (h * width + w) * channels + c;
        if (output_data[index] == bottom_data_n[idx]) {
          maxidx = idx;
          should_stop = true;
        }
      }
    }
    // Set the bottom diff (atomic is not necessary). The index could still be
    // uninitialized, if all the bottom_data are NaN.
    if (maxidx != -1) {
      bottom_diff[index] = top_diff[n * height * width * channels + maxidx];
    }
  }
}

// The parameters to the kernels in the gradient-of-gradient function are as
// follows:
//     nthreads: the number of threads, which is equal to the output size. The
//         gradient of the MaxPooling gradient w.r.t. the output data has
//         dimensions of N*C*Hout*Wout.
//     top_diff: the gradient of the gradient of the output data w.r.t. the
//         input data, of size N*H*W*C (or N*C*H*W). As we have stored the
//         flattened index of the input entries, the backward function is
//         agnostic of the input storage order.
//     mask: the output mask of the same size as top_data. It is stored in
//         int form, keeping track of the flattened index of the input item
//         that produces the max output.
//     top_offset: the pre-computed per-image offset of the maxpool input
//         gradient. This is equal to H*W*C. We choose to pre-compute this so
//         we do not need to compute it every time inside the kernel.
//     bottom_offset: the pre-computed per-image offset of the maxpool output.
//         This is equal to Hout*Wout*C.
//     bottom_diff: the gradient of the gradient w.r.t. the output.
template <typename dtype>
__global__ void MaxPoolGradBackward(const int nthreads, const dtype* top_diff,
                                    const int64* mask, const int top_offset,
                                    const int bottom_offset,
                                    dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int image_id = (index / bottom_offset);
    bottom_diff[index] = top_diff[image_id * top_offset + mask[index]];
  }
}

#undef CUDA_1D_KERNEL_LOOP
}  // namespace

namespace functor {

// Note: channels is the outer channels (dim 1) which has already been
// divided by 4.
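// For example, an NCHW_VECT_C tensor with 64 logical channels is laid out as
// [N, 16, H, W, 4], so the caller would pass channels = 16 here (illustrative
// numbers, not part of the original comment).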
bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
    const int32* bottom_data, const int batch, const int height,
    const int width, int channels, const int pooled_height,
    const int pooled_width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    int32* top_data, const Eigen::GpuDevice& d) {
  const int kThreadsPerBlock = 1024;
  const int output_size = batch * channels * pooled_height * pooled_width;
  MaxPoolForwardNoMaskKernel_NCHW_VECT_C<<<
      (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock,
      0, d.stream()>>>(output_size, bottom_data, height, width, channels,
                       pooled_height, pooled_width, kernel_h, kernel_w,
                       stride_h, stride_w, pad_t, pad_l, top_data);
  d.synchronize();
  return d.ok();
}

template <typename T>
bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
    const T* bottom_data, const int batch, const int height, const int width,
    const int channels, const int pooled_height, const int pooled_width,
    const int kernel_h, const int kernel_w, const int stride_h,
    const int stride_w, const int pad_t, const int pad_l, T* top_data,
    int64* mask, const Eigen::GpuDevice& d, bool propagate_nans) {
  const int kThreadsPerBlock = 1024;
  const int output_size = batch * channels * pooled_height * pooled_width;
  if (propagate_nans) {
    MaxPoolForwardNHWC<true>
        <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
           kThreadsPerBlock, 0, d.stream()>>>(
            output_size, bottom_data, height, width, channels, pooled_height,
            pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
            top_data, mask);
  } else {
    MaxPoolForwardNHWC<false>
        <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
           kThreadsPerBlock, 0, d.stream()>>>(
            output_size, bottom_data, height, width, channels, pooled_height,
            pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
            top_data, mask);
  }
  return d.ok();
}

template <typename T>
bool MaxPoolBackwardNoMask<T>::operator()(
    const T* bottom_data, const int batch, const int height, const int width,
    const int channels, const int pooled_height, const int pooled_width,
    const int kernel_h, const int kernel_w, const int stride_h,
    const int stride_w, const int pad_t, const int pad_l, const T* top_diff,
    T* bottom_diff, const Eigen::GpuDevice& d) {
  const int kThreadsPerBlock = 1024;

  const int bottom_size = batch * channels * height * width;
  SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
            kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);

  const int top_size = batch * channels * pooled_height * pooled_width;
  MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) /
                                  kThreadsPerBlock,
                              kThreadsPerBlock, 0, d.stream()>>>(
      top_size, bottom_data, height, width, channels, pooled_height,
      pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
      top_diff, bottom_diff);
  return d.ok();
}

template <typename T>
bool MaxPoolBackwardWithArgmax<T>::operator()(
    const int output_size, const int input_size, const T* top_diff,
    const int64* mask, const int top_offset, const int bottom_offset,
    T* bottom_diff, const Eigen::GpuDevice& d) {
  const int kThreadsPerBlock = 1024;
  SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
            kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
  MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                    kThreadsPerBlock, 0, d.stream()>>>(
      output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
  return d.ok();
}

template <typename T>
bool MaxPoolGradBackwardNoMask<T>::operator()(
    TensorFormat data_format, const T* bottom_data, const T* output_data,
    const int batch, const int pooled_height, const int pooled_width,
    const int channels, const int height, const int width, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_t,
    const int pad_l, const T* top_diff, T* bottom_diff,
    const Eigen::GpuDevice& d) {
  const int num_kernels = batch * channels * pooled_height * pooled_width;
  CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);

  if (data_format == FORMAT_NHWC) {
    MaxPoolGradBackwardNoMaskNHWC<<<config.block_count, config.thread_per_block,
                                    0, d.stream()>>>(
        num_kernels, bottom_data, output_data, pooled_height, pooled_width,
        channels, height, width, kernel_h, kernel_w, stride_h, stride_w, pad_t,
        pad_l, top_diff, bottom_diff);
  } else {
    MaxPoolGradBackwardNoMaskNCHW<<<config.block_count, config.thread_per_block,
                                    0, d.stream()>>>(
        num_kernels, bottom_data, output_data, pooled_height, pooled_width,
        channels, height, width, kernel_h, kernel_w, stride_h, stride_w, pad_t,
        pad_l, top_diff, bottom_diff);
  }
  return d.ok();
}

template <typename T>
bool MaxPoolGradBackwardWithArgmax<T>::operator()(
    const int output_size, const int input_size, const T* top_diff,
    const int64* mask, const int top_offset, const int bottom_offset,
    T* bottom_diff, const Eigen::GpuDevice& d) {
  CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
  MaxPoolGradBackward<<<config.block_count, config.thread_per_block, 0,
                        d.stream()>>>(output_size, top_diff, mask, top_offset,
                                      bottom_offset, bottom_diff);
  return d.ok();
}

typedef Eigen::GpuDevice GPUDevice;

#define DEFINE_GPU_KERNELS(T)                          \
  template struct SpatialMaxPooling<GPUDevice, T>;     \
  template struct MaxPoolForwardWithOptionalArgmax<T>; \
  template struct MaxPoolBackwardWithArgmax<T>;        \
  template struct MaxPoolBackwardNoMask<T>;            \
  template struct MaxPoolGradBackwardWithArgmax<T>;    \
  template struct MaxPoolGradBackwardNoMask<T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);

#undef DEFINE_GPU_KERNELS

}  // namespace functor

}  // end namespace tensorflow

#endif  // GOOGLE_CUDA