/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <stdio.h>
#include <cfloat>

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/kernels/maxpooling_op.h"
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {
namespace {
template <bool propagate_nans, typename dtype>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool IsGreaterThan(dtype a, dtype b) {
  if (propagate_nans) {
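    // Ordered comparisons involving NaN evaluate to false, so !(a <= b) is
    // true whenever the candidate a is NaN; a plain a > b would never select
    // a NaN input.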
    return !(a <= b);
  } else {
    return a > b;
  }
}

// This is Yangqing's custom kernel for the maxpooling operation. There are
// three functions: MaxPoolForwardNCHW and MaxPoolForwardNHWC are the forward
// functions for the NCHW and NHWC storage orders, and MaxPoolBackward is the
// backward function, which handles both storage orders.
// The parameters to the kernels in the forward functions are as follows:
//     nthreads: the number of threads, which is equal to the output size.
//     bottom_data: the bottom data of N*H*W*C (or N*C*H*W) items.
//     height, width, pooled_height, pooled_width: the input and output sizes.
//     kernel_h, kernel_w: the kernel sizes.
//     stride_h, stride_w: the strides.
//     pad_t, pad_l: the padding values on the top and left side.
//     top_data: the maxpool output.
//     mask: the output mask of the same size as top_data. It is stored in
//         int form, keeping track of the flattened index of the input item that
//         produces the max output. If a nullptr is passed in for mask, no mask
//         will be produced.
//
// To call the forward and backward functions, use e.g.:
// const int kThreadsPerBlock = 1024;
// const int output_size = batch * channels * pooled_height * pooled_width;
// MaxPoolForwardNCHW<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
//                      kThreadsPerBlock, 0, cuda_stream>>>(...);
template <bool propagate_nans, typename dtype>
__global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
                                   const int channels, const int height,
                                   const int width, const int pooled_height,
                                   const int pooled_width, const int kernel_h,
                                   const int kernel_w, const int stride_h,
                                   const int stride_w, const int pad_t,
                                   const int pad_l, dtype* top_data,
                                   int64* mask) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    dtype maxval = Eigen::NumTraits<dtype>::lowest();
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * channels * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int idx = c * height * width + h * width + w;
        if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
          maxidx = idx;
          maxval = bottom_data_n[idx];
        }
      }
    }
    top_data[index] = maxval;
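    // Note that maxidx is flattened relative to the start of this image
    // (bottom_data_n); MaxPoolBackward later adds the per-image bottom_offset
    // before scattering the gradient.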
    if (mask != nullptr) {
      mask[index] = maxidx;
    }
  }
}

// The parameters for MaxPoolForwardNoMaskKernel_NCHW_VECT_C are the same as
// for MaxPoolForwardNCHW above, except that the mask is not supported, and
// each element of the input and output contains 4 adjacent channel values for
// the same (x, y) coordinate (so channels = outer channels, and
// output_size = real output size / 4).
__global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
    const int nthreads, const int32* bottom_data, const int height,
    const int width, const int channels, const int pooled_height,
    const int pooled_width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    int32* top_data) {
  // TODO(pauldonnelly): Implement a better optimized version of this kernel.
  const int32 kMinINT8X4 = 0x80808080;
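  // Each byte 0x80 is -128, the smallest signed int8, so kMinINT8X4 packs four
  // per-byte minima into one int32; the __vmaxs4 intrinsic below then takes a
  // bytewise signed max against the running value.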
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    int32 maxval = kMinINT8X4;
    const int32* bottom_data_n = bottom_data + n * channels * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int idx = (c * height + h) * width + w;
        maxval = __vmaxs4(maxval, bottom_data_n[idx]);
      }
    }
    top_data[index] = maxval;
  }
}

template <bool propagate_nans, typename dtype>
__global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
                                   const int height, const int width,
                                   const int channels, const int pooled_height,
                                   const int pooled_width, const int kernel_h,
                                   const int kernel_w, const int stride_h,
                                   const int stride_w, const int pad_t,
                                   const int pad_l, dtype* top_data,
                                   int64* mask) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int n = index;
    int c = n % channels;
    n /= channels;
    int wstart = (n % pooled_width) * stride_w - pad_l;
    n /= pooled_width;
    int hstart = (n % pooled_height) * stride_h - pad_t;
    n /= pooled_height;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    dtype maxval = Eigen::NumTraits<dtype>::lowest();
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * height * width * channels;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int idx = (h * width + w) * channels + c;
        if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
          maxidx = idx;
          maxval = bottom_data_n[idx];
        }
      }
    }
    top_data[index] = maxval;
    if (mask != nullptr) {
      mask[index] = maxidx;
    }
  }
}

template <typename dtype>
__global__ void MaxPoolBackwardNoMaskNHWC(
    const int nthreads, const dtype* bottom_data, const int height,
    const int width, const int channels, const int pooled_height,
    const int pooled_width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    const dtype* top_diff, dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // First find out the index to the maximum, since we have no mask.
    int n = index;
    int c = n % channels;
    n /= channels;
    int wstart = (n % pooled_width) * stride_w - pad_l;
    n /= pooled_width;
    int hstart = (n % pooled_height) * stride_h - pad_t;
    n /= pooled_height;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    dtype maxval = Eigen::NumTraits<dtype>::lowest();
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * height * width * channels;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int idx = (h * width + w) * channels + c;
        if (bottom_data_n[idx] > maxval) {
          maxidx = idx;
          maxval = bottom_data_n[idx];
        }
      }
    }

    // Atomically accumulate the bottom diff. The index could still be
    // uninitialized if all the bottom_data are NaN.
    if (maxidx != -1) {
      CudaAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
                    top_diff[index]);
    }
  }
}

// The parameters to the kernels in the backward function are as follows:
//     nthreads: the number of threads, which is equal to the output size.
//     top_diff: the gradient of the output data, of size N*Hout*Wout*C (or
//        N*C*Hout*Wout). As we have stored the flattened index of the input
//        entries, the backward function is agnostic of the input storage order.
//     mask: the output mask of the same size as top_data. It is stored in
//         int form, keeping track of the flattened index of the input item that
//         produces the max output.
//     top_offset: the pre-computed per-image offset of the maxpool output. This
//         is equal to Hout*Wout*C. We choose to pre-compute this so we do not
//         need to compute it every time inside the kernel.
//     bottom_offset: the pre-computed per-image offset of the maxpool input.
//         This is equal to H*W*C.
//     bottom_diff: the gradient with respect to the input.
// This function relies on CudaAtomicAdd to avoid race conditions. Also, before
// the kernel is run, you will need to make sure that bottom_diff is filled with
// zeros first.
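//
// For example (a sketch mirroring MaxPoolBackwardWithArgmax below; input_size
// and output_size are the flattened input and output element counts):
// SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
//           kThreadsPerBlock, 0, cuda_stream>>>(input_size, bottom_diff);
// MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
//                   kThreadsPerBlock, 0, cuda_stream>>>(
//     output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);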
template <typename dtype>
__global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
                                const int64* mask, const int top_offset,
                                const int bottom_offset, dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int image_id = (index / top_offset);
    CudaAtomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
                  top_diff[index]);
  }
}

// The parameters to the kernels in the gradient-of-gradient function are as
// follows:
//     nthreads: the number of threads, which is equal to the output size. The
//         gradient of the MaxPooling gradient w.r.t. the output data has
//         dimensions of N*C*Hout*Wout.
//     bottom_data: the bottom data of N*H*W*C (or N*C*H*W) items.
//     output_data: the output data of N*Hout*Wout*C (or N*C*Hout*Wout) items.
//     height, width, pooled_height, pooled_width: the input and output sizes.
//     kernel_h, kernel_w: the kernel sizes.
//     stride_h, stride_w: the strides.
//     pad_t, pad_l: the padding values on the top and left side.
//     top_diff: the gradient of the gradient of the output data w.r.t. the
//         input data, of size N*H*W*C (or N*C*H*W).
//     bottom_diff: the gradient of the gradient w.r.t. the output.
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNCHW(
    const int nthreads, const dtype* bottom_data, const dtype* output_data,
    const int pooled_height, const int pooled_width, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    const dtype* top_diff, dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // First find out the index to the maximum, since we have no mask.
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    const int hend = min(hstart + kernel_h, height);
    const int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    bool should_stop = false;
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * channels * height * width;
    // Propagate only the first value from top_diff corresponding to the max.
    for (int h = hstart; h < hend && !should_stop; ++h) {
      for (int w = wstart; w < wend && !should_stop; ++w) {
        int idx = c * height * width + h * width + w;
        if (output_data[index] == bottom_data_n[idx]) {
          maxidx = idx;
          should_stop = true;
        }
      }
    }
    // Set the bottom diff (atomic is not necessary). The index could still be
    // uninitialized if all the bottom_data are NaN.
    if (maxidx != -1) {
      bottom_diff[index] = top_diff[n * channels * height * width + maxidx];
    }
  }
}

template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNHWC(
    const int nthreads, const dtype* bottom_data, const dtype* output_data,
    const int pooled_height, const int pooled_width, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    const dtype* top_diff, dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // First find out the index to the maximum, since we have no mask.
    int n = index;
    int c = n % channels;
    n /= channels;
    int wstart = (n % pooled_width) * stride_w - pad_l;
    n /= pooled_width;
    int hstart = (n % pooled_height) * stride_h - pad_t;
    n /= pooled_height;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    bool should_stop = false;
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * height * width * channels;
    // Propagate only the first value from top_diff corresponding to the max.
    for (int h = hstart; h < hend && !should_stop; ++h) {
      for (int w = wstart; w < wend && !should_stop; ++w) {
        int idx = (h * width + w) * channels + c;
        if (output_data[index] == bottom_data_n[idx]) {
          maxidx = idx;
          should_stop = true;
        }
      }
    }
    // Set the bottom diff (atomic is not necessary). The index could still be
    // uninitialized if all the bottom_data are NaN.
    if (maxidx != -1) {
      bottom_diff[index] = top_diff[n * height * width * channels + maxidx];
    }
  }
}

// The parameters to the kernels in the gradient-of-gradient function are as
// follows:
//     nthreads: the number of threads, which is equal to the output size. The
//         gradient of the MaxPooling gradient w.r.t. the output data has
//         dimensions of N*C*Hout*Wout.
//     top_diff: the gradient of the gradient of the output data w.r.t. the
//         input data, of size N*H*W*C (or N*C*H*W). As we have stored the
//         flattened index of the input entries, the backward function is
//         agnostic of the input storage order.
//     mask: the output mask of the same size as top_data. It is stored in
//         int form, keeping track of the flattened index of the input item that
//         produces the max output.
//     top_offset: the pre-computed per-image offset of the maxpool input
//         gradient. This is equal to H*W*C. We choose to pre-compute this so we
//         do not need to compute it every time inside the kernel.
//     bottom_offset: the pre-computed per-image offset of the maxpool output.
//         This is equal to Hout*Wout*C.
//     bottom_diff: the gradient of the gradient w.r.t. the output.
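//
// For example (a sketch mirroring MaxPoolGradBackwardWithArgmax below):
// CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
// MaxPoolGradBackward<<<config.block_count, config.thread_per_block, 0,
//                       d.stream()>>>(output_size, top_diff, mask, top_offset,
//                                     bottom_offset, bottom_diff);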
template <typename dtype>
__global__ void MaxPoolGradBackward(const int nthreads, const dtype* top_diff,
                                    const int64* mask, const int top_offset,
                                    const int bottom_offset,
                                    dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int image_id = (index / bottom_offset);
    bottom_diff[index] = top_diff[image_id * top_offset + mask[index]];
  }
}

#undef CUDA_1D_KERNEL_LOOP
}  // namespace

namespace functor {

// Note: channels is the outer channels (dim 1) which has already been
// divided by 4.
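// For example (illustrative, not specific to this file): a qint8 tensor with
// logical C = 64 stored as NCHW_VECT_C is laid out as [N, 16, H, W, 4], so
// channels here would be 16.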
bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
    const int32* bottom_data, const int batch, const int height,
    const int width, int channels, const int pooled_height,
    const int pooled_width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    int32* top_data, const Eigen::GpuDevice& d) {
  const int kThreadsPerBlock = 1024;
  const int output_size = batch * channels * pooled_height * pooled_width;
  MaxPoolForwardNoMaskKernel_NCHW_VECT_C<<<
      (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock,
      0, d.stream()>>>(output_size, bottom_data, height, width, channels,
                       pooled_height, pooled_width, kernel_h, kernel_w,
                       stride_h, stride_w, pad_t, pad_l, top_data);
  d.synchronize();
  return d.ok();
}

template <typename T>
bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
    const T* bottom_data, const int batch, const int height, const int width,
    const int channels, const int pooled_height, const int pooled_width,
    const int kernel_h, const int kernel_w, const int stride_h,
    const int stride_w, const int pad_t, const int pad_l, T* top_data,
    int64* mask, const Eigen::GpuDevice& d, bool propagate_nans) {
  const int kThreadsPerBlock = 1024;
  const int output_size = batch * channels * pooled_height * pooled_width;
  if (propagate_nans) {
    MaxPoolForwardNHWC<true>
        <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
           kThreadsPerBlock, 0, d.stream()>>>(
            output_size, bottom_data, height, width, channels, pooled_height,
            pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
            top_data, mask);
  } else {
    MaxPoolForwardNHWC<false>
        <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
           kThreadsPerBlock, 0, d.stream()>>>(
            output_size, bottom_data, height, width, channels, pooled_height,
            pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
            top_data, mask);
  }
  return d.ok();
}

template <typename T>
bool MaxPoolBackwardNoMask<T>::operator()(
    const T* bottom_data, const int batch, const int height, const int width,
    const int channels, const int pooled_height, const int pooled_width,
    const int kernel_h, const int kernel_w, const int stride_h,
    const int stride_w, const int pad_t, const int pad_l, const T* top_diff,
    T* bottom_diff, const Eigen::GpuDevice& d) {
  const int kThreadsPerBlock = 1024;

  const int bottom_size = batch * channels * height * width;
  SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
            kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);

  const int top_size = batch * channels * pooled_height * pooled_width;
  MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) /
                                  kThreadsPerBlock,
                              kThreadsPerBlock, 0, d.stream()>>>(
      top_size, bottom_data, height, width, channels, pooled_height,
      pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
      top_diff, bottom_diff);
  return d.ok();
}

template <typename T>
bool MaxPoolBackwardWithArgmax<T>::operator()(
    const int output_size, const int input_size, const T* top_diff,
    const int64* mask, const int top_offset, const int bottom_offset,
    T* bottom_diff, const Eigen::GpuDevice& d) {
  const int kThreadsPerBlock = 1024;
  SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
            kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
  MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                    kThreadsPerBlock, 0, d.stream()>>>(
      output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
  return d.ok();
}

template <typename T>
bool MaxPoolGradBackwardNoMask<T>::operator()(
    TensorFormat data_format, const T* bottom_data, const T* output_data,
    const int batch, const int pooled_height, const int pooled_width,
    const int channels, const int height, const int width, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_t,
    const int pad_l, const T* top_diff, T* bottom_diff,
    const Eigen::GpuDevice& d) {
  const int num_kernels = batch * channels * pooled_height * pooled_width;
  CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);

  if (data_format == FORMAT_NHWC) {
    MaxPoolGradBackwardNoMaskNHWC<<<config.block_count, config.thread_per_block,
                                    0, d.stream()>>>(
        num_kernels, bottom_data, output_data, pooled_height, pooled_width,
        channels, height, width, kernel_h, kernel_w, stride_h, stride_w, pad_t,
        pad_l, top_diff, bottom_diff);
  } else {
    MaxPoolGradBackwardNoMaskNCHW<<<config.block_count, config.thread_per_block,
                                    0, d.stream()>>>(
        num_kernels, bottom_data, output_data, pooled_height, pooled_width,
        channels, height, width, kernel_h, kernel_w, stride_h, stride_w, pad_t,
        pad_l, top_diff, bottom_diff);
  }
  return d.ok();
}

template <typename T>
bool MaxPoolGradBackwardWithArgmax<T>::operator()(
    const int output_size, const int input_size, const T* top_diff,
    const int64* mask, const int top_offset, const int bottom_offset,
    T* bottom_diff, const Eigen::GpuDevice& d) {
  CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
  MaxPoolGradBackward<<<config.block_count, config.thread_per_block, 0,
                        d.stream()>>>(output_size, top_diff, mask, top_offset,
                                      bottom_offset, bottom_diff);
  return d.ok();
}

typedef Eigen::GpuDevice GPUDevice;

#define DEFINE_GPU_KERNELS(T)                          \
  template struct SpatialMaxPooling<GPUDevice, T>;     \
  template struct MaxPoolForwardWithOptionalArgmax<T>; \
  template struct MaxPoolBackwardWithArgmax<T>;        \
  template struct MaxPoolBackwardNoMask<T>;            \
  template struct MaxPoolGradBackwardWithArgmax<T>;    \
  template struct MaxPoolGradBackwardNoMask<T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);

#undef DEFINE_GPU_KERNELS

}  // namespace functor

}  // end namespace tensorflow

#endif  // GOOGLE_CUDA