Home | History | Annotate | Download | only in kernels
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // See docs in ../ops/nn_ops.cc.
     17 
     18 #define EIGEN_USE_THREADS
     19 
     20 #include "tensorflow/core/kernels/maxpooling_op.h"
     21 
     22 #include <vector>
     23 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
     24 #include "tensorflow/core/common_runtime/device.h"
     25 #include "tensorflow/core/framework/numeric_op.h"
     26 #include "tensorflow/core/framework/op_kernel.h"
     27 #include "tensorflow/core/framework/register_types.h"
     28 #include "tensorflow/core/framework/tensor.h"
     29 #include "tensorflow/core/framework/tensor_shape.h"
     30 #include "tensorflow/core/framework/tensor_slice.h"
     31 #include "tensorflow/core/kernels/conv_2d.h"
     32 #include "tensorflow/core/kernels/eigen_pooling.h"
     33 #include "tensorflow/core/kernels/ops_util.h"
     34 #include "tensorflow/core/kernels/pooling_ops_common.h"
     35 #include "tensorflow/core/lib/core/errors.h"
     36 #include "tensorflow/core/lib/gtl/array_slice.h"
     37 #include "tensorflow/core/util/env_var.h"
     38 #include "tensorflow/core/util/padding.h"
     39 #include "tensorflow/core/util/tensor_format.h"
     40 #include "tensorflow/core/util/use_cudnn.h"
     41 
     42 #if GOOGLE_CUDA
     43 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
     44 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
     45 #include "tensorflow/core/platform/stream_executor.h"
     46 #endif  // GOOGLE_CUDA
     47 
     48 namespace tensorflow {
     49 
     50 typedef Eigen::ThreadPoolDevice CPUDevice;
     51 typedef Eigen::GpuDevice GPUDevice;
     52 
     53 const int kInvalidMaxPoolingIndex = -1;
     54 
     55 template <typename Device, typename T>
     56 static void SpatialMaxPoolWithArgMaxHelper(
     57     OpKernelContext* context, Tensor* output, Tensor* output_arg_max,
     58     Tensor* input_backprop, const Tensor& tensor_in, const Tensor& out_backprop,
     59     const PoolParameters& params, const Padding& padding) {
     60   typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
     61       ConstEigenMatrixMap;
     62   typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
     63       EigenMatrixMap;
     64   typedef Eigen::Map<Eigen::Matrix<int64, Eigen::Dynamic, Eigen::Dynamic>>
     65       EigenIndexMatrixMap;
     66 
     67   ConstEigenMatrixMap in_mat(
     68       tensor_in.flat<T>().data(), params.depth,
     69       params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
     70   EigenMatrixMap out_mat(
     71       output->flat<T>().data(), params.depth,
     72       params.out_width * params.out_height * params.tensor_in_batch);
     73   EigenIndexMatrixMap out_arg_max_mat(
     74       output_arg_max->flat<int64>().data(), params.depth,
     75       params.out_width * params.out_height * params.tensor_in_batch);
     76 
     77   const DeviceBase::CpuWorkerThreads& worker_threads =
     78       *(context->device()->tensorflow_cpu_worker_threads());
     79 
     80   // The following code basically does the following:
     81   // 1. Flattens the input and output tensors into two dimensional arrays.
     82   //    tensor_in_as_matrix:
     83   //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
     84   //    output_as_matrix:
     85   //      depth by (out_width * out_height * tensor_in_batch)
     86   //
     87   // 2. Walks through the set of columns in the flattened tensor_in_as_matrix,
     88   //    and updates the corresponding column(s) in output_as_matrix with the
     89   //    max value.
     90   auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop,
     91                 &output_arg_max, &out_backprop](int64 start, int64 limit) {
     92     const int32 depth = params.depth;
     93     const int32 in_rows = params.tensor_in_rows;
     94     const int32 in_cols = params.tensor_in_cols;
     95     const int32 pad_rows = params.pad_rows;
     96     const int32 pad_cols = params.pad_cols;
     97     const int32 window_rows = params.window_rows;
     98     const int32 window_cols = params.window_cols;
     99     const int32 row_stride = params.row_stride;
    100     const int32 col_stride = params.col_stride;
    101     const int32 out_height = params.out_height;
    102     const int32 out_width = params.out_width;
    103 
    104     {
    105       // Initializes the output tensor with MIN<T>.
    106       const int32 output_image_size = out_height * out_width * depth;
    107       EigenMatrixMap out_shard(out_mat.data() + start * output_image_size, 1,
    108                                (limit - start) * output_image_size);
    109       out_shard.setConstant(Eigen::NumTraits<T>::lowest());
    110       EigenIndexMatrixMap out_arg_max_shard(
    111           out_arg_max_mat.data() + start * output_image_size, 1,
    112           (limit - start) * output_image_size);
    113       out_arg_max_shard.setConstant(kInvalidMaxPoolingIndex);
    114     }
    115 
    116     for (int64 b = start; b < limit; ++b) {
    117       for (int h = 0; h < in_rows; ++h) {
    118         for (int w = 0; w < in_cols; ++w) {
    119           // (h_start, h_end) * (w_start, w_end) is the range that the input
    120           // vector projects to.
    121           const int hpad = h + pad_rows;
    122           const int wpad = w + pad_cols;
    123           const int h_start =
    124               (hpad < window_rows) ? 0 : (hpad - window_rows) / row_stride + 1;
    125           const int h_end = std::min(hpad / row_stride + 1, out_height);
    126           const int w_start =
    127               (wpad < window_cols) ? 0 : (wpad - window_cols) / col_stride + 1;
    128           const int w_end = std::min(wpad / col_stride + 1, out_width);
    129           // compute elementwise max
    130           const int64 in_index = (b * in_rows + h) * in_cols + w;
    131           for (int ph = h_start; ph < h_end; ++ph) {
    132             const int64 out_index_base = (b * out_height + ph) * out_width;
    133             for (int pw = w_start; pw < w_end; ++pw) {
    134               const int64 out_index = out_index_base + pw;
    135               /// NOTES(zhengxq): not using the eigen matrix operation for
    136               /// now.
    137               for (int d = 0; d < depth; ++d) {
    138                 const T& input_ref = in_mat.coeffRef(d, in_index);
    139                 T& output_ref = out_mat.coeffRef(d, out_index);
    140                 int64& out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index);
    141                 if (output_ref < input_ref ||
    142                     out_arg_max_ref == kInvalidMaxPoolingIndex) {
    143                   output_ref = input_ref;
    144                   int64 input_offset = in_index * depth + d;
    145                   out_arg_max_ref = input_offset;
    146                 }
    147               }
    148             }
    149           }
    150         }
    151       }
    152     }
    153 
    154     {
    155       auto input_backprop_flat = input_backprop->flat<T>();
    156       auto out_arg_max_flat = output_arg_max->flat<int64>();
    157       auto out_backprop_flat = out_backprop.flat<T>();
    158 
    159       // Initialize output to 0.
    160       const int64 in_size = in_rows * in_cols * depth;
    161       const int64 in_start = start * in_size;
    162       const int64 in_end = limit * in_size;
    163       EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1,
    164                               in_end - in_start);
    165       in_shard.setConstant(T(0));
    166 
    167       // Backpropagate.
    168       const int out_size = out_height * out_width * depth;
    169       const int out_start = start * out_size;
    170       const int out_end = limit * out_size;
    171       for (int index = out_start; index < out_end; ++index) {
    172         int input_backprop_index = out_arg_max_flat(index);
    173         // Although this check is in the inner loop, it is worth its value
    174         // so we don't end up with memory corruptions. Our benchmark shows that
    175         // the performance impact is quite small
    176         CHECK(input_backprop_index >= in_start && input_backprop_index < in_end)
    177             << "Invalid input backprop index: " << input_backprop_index << ", "
    178             << in_start << ", " << in_end;
    179         input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
    180       }
    181     }
    182   };
    183 
    184   const int64 shard_cost = params.tensor_in_rows * params.tensor_in_cols *
    185                            params.depth * params.window_rows *
    186                            params.window_cols;
    187   Shard(worker_threads.num_threads, worker_threads.workers,
    188         params.tensor_in_batch, shard_cost, shard);
    189 }
    190 
    191 // The operation to compute MaxPool gradients.
    192 // It takes three inputs:
    193 //   - The original input tensor
    194 //   - The original output tensor
    195 //   - Backprop tensor for output
    196 // It produces one output: backprop tensor for input.
    197 template <class Device, class T>
    198 class MaxPoolingGradOp : public OpKernel {
    199  public:
    200   explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
    201     string data_format;
    202     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    203     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
    204                 errors::InvalidArgument("Invalid data format"));
    205     OP_REQUIRES(
    206         context, data_format_ == FORMAT_NHWC,
    207         errors::InvalidArgument("Default MaxPoolingGradOp only supports NHWC ",
    208                                 "on device type ",
    209                                 DeviceTypeString(context->device_type())));
    210 
    211     if (context->num_inputs() == 3) {
    212       OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    213       OP_REQUIRES(context, ksize_.size() == 4,
    214                   errors::InvalidArgument("Sliding window ksize field must "
    215                                           "specify 4 dimensions"));
    216       OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    217       OP_REQUIRES(context, stride_.size() == 4,
    218                   errors::InvalidArgument("Sliding window strides field must "
    219                                           "specify 4 dimensions"));
    220       OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
    221                   errors::Unimplemented(
    222                       "Pooling is not yet supported on the batch dimension."));
    223       OP_REQUIRES(
    224           context, ksize_[3] == 1 && stride_[3] == 1,
    225           errors::Unimplemented(
    226               "MaxPoolingGrad is not yet supported on the depth dimension."));
    227     }
    228 
    229     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    230   }
    231 
    232   void Compute(OpKernelContext* context) override {
    233     const Tensor& tensor_in = context->input(0);
    234     const Tensor& tensor_out = context->input(1);
    235     const Tensor& out_backprop = context->input(2);
    236 
    237     // For maxpooling, tensor_in should have 4 dimensions.
    238     OP_REQUIRES(context, tensor_in.dims() == 4,
    239                 errors::InvalidArgument("tensor_in must be 4-dimensional"));
    240     OP_REQUIRES(context, tensor_out.dims() == 4,
    241                 errors::InvalidArgument("tensor_out must be 4-dimensional"));
    242     // For maxpooling, out_backprop should have 4 dimensions.
    243     OP_REQUIRES(context, out_backprop.dims() == 4,
    244                 errors::InvalidArgument("out_backprop must be 4-dimensional"));
    245 
    246     const TensorShape& output_shape = tensor_in.shape();
    247 
    248     Tensor tensor_out_dup;
    249     OP_REQUIRES_OK(context, context->forward_input_or_allocate_temp(
    250                                 {1}, DataTypeToEnum<T>::v(), tensor_out.shape(),
    251                                 &tensor_out_dup));
    252     Tensor tensor_out_arg_max;
    253     OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(),
    254                                                    tensor_out.shape(),
    255                                                    &tensor_out_arg_max));
    256     std::vector<int32> ksize = ksize_;
    257     std::vector<int32> stride = stride_;
    258     if (context->num_inputs() == 5) {
    259       const Tensor& tensor_ksize = context->input(3);
    260       auto value_ksize = tensor_ksize.flat<int32>();
    261       ksize.resize(tensor_ksize.shape().num_elements());
    262       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
    263 
    264       const Tensor& tensor_stride = context->input(4);
    265       auto value_stride = tensor_stride.flat<int32>();
    266       stride.resize(tensor_stride.shape().num_elements());
    267       std::copy_n(&value_stride(0), stride.size(), stride.begin());
    268     }
    269 
    270     OP_REQUIRES(context, ksize.size() == 4,
    271                 errors::InvalidArgument("Sliding window ksize field must "
    272                                         "specify 4 dimensions"));
    273     OP_REQUIRES(context, stride.size() == 4,
    274                 errors::InvalidArgument("Sliding window strides field must "
    275                                         "specify 4 dimensions"));
    276     OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
    277                 errors::Unimplemented(
    278                     "Pooling is not yet supported on the batch dimension."));
    279     OP_REQUIRES(
    280         context, ksize[3] == 1 && stride[3] == 1,
    281         errors::Unimplemented(
    282             "MaxPoolingGrad is not yet supported on the depth dimension."));
    283 
    284     PoolParameters params{context,  ksize,       stride,
    285                           padding_, FORMAT_NHWC, tensor_in.shape()};
    286     if (!context->status().ok()) {
    287       return;
    288     }
    289 
    290     Tensor* output = nullptr;
    291     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
    292                                 {0}, 0, output_shape, &output));
    293 
    294     SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
    295         context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
    296         out_backprop, params, padding_);
    297   }
    298 
    299  private:
    300   std::vector<int32> ksize_;
    301   std::vector<int32> stride_;
    302   Padding padding_;
    303   TensorFormat data_format_;
    304 };
    305 
    306 #ifdef GOOGLE_CUDA
    307 
    308 template <typename T>
    309 static void MaxPoolingBackwardCustomKernel(
    310     OpKernelContext* context, const std::vector<int32>& size,
    311     const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in,
    312     const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
    313   Tensor* output = nullptr;
    314   OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
    315                               {0}, 0, tensor_in_shape, &output));
    316 
    317   PoolParameters params{context, size,        stride,
    318                         padding, FORMAT_NHWC, tensor_in_shape};
    319   if (!context->status().ok()) {
    320     return;
    321   }
    322 
    323   functor::MaxPoolBackwardNoMask<T>()(
    324       tensor_in->flat<T>().data(), params.tensor_in_batch,
    325       params.tensor_in_rows, params.tensor_in_cols, params.depth,
    326       params.out_height, params.out_width, params.window_rows,
    327       params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
    328       params.pad_cols, out_backprop.flat<T>().data(), output->flat<T>().data(),
    329       context->eigen_device<Eigen::GpuDevice>());
    330 }
    331 
    332 template <class T>
    333 class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
    334  public:
    335   typedef Eigen::GpuDevice Device;
    336 
    337   explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
    338     string data_format;
    339     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    340     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
    341                 errors::InvalidArgument("Invalid data format"));
    342     if (context->num_inputs() == 3) {
    343       OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    344       OP_REQUIRES(context, ksize_.size() == 4,
    345                   errors::InvalidArgument("Sliding window ksize field must "
    346                                           "specify 4 dimensions"));
    347       OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    348       OP_REQUIRES(context, stride_.size() == 4,
    349                   errors::InvalidArgument("Sliding window strides field must "
    350                                           "specify 4 dimensions"));
    351       const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
    352       const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
    353       OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
    354                   errors::Unimplemented(
    355                       "Pooling is not yet supported on the batch dimension."));
    356     }
    357     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    358 
    359     use_dnn_ = CanUseCudnn();
    360     TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
    361                                    &propagate_nans_));
    362   }
    363 
    364   void Compute(OpKernelContext* context) override {
    365     const Tensor& tensor_in = context->input(0);
    366     const Tensor& tensor_out = context->input(1);
    367     const Tensor& out_backprop = context->input(2);
    368 
    369     // For maxpooling, tensor_in should have 4 dimensions.
    370     OP_REQUIRES(context, tensor_in.dims() == 4,
    371                 errors::InvalidArgument("tensor_in must be 4-dimensional 4"));
    372     OP_REQUIRES(context, tensor_out.dims() == 4,
    373                 errors::InvalidArgument("tensor_out must be 4-dimensional"));
    374     // For maxpooling, out_backprop should have 4 dimensions.
    375     OP_REQUIRES(context, out_backprop.dims() == 4,
    376                 errors::InvalidArgument("out_backprop must be 4-dimensional"));
    377 
    378     TensorShape output_shape = tensor_in.shape();
    379 
    380     std::vector<int32> ksize = ksize_;
    381     std::vector<int32> stride = stride_;
    382     if (context->num_inputs() == 5) {
    383       const Tensor& tensor_ksize = context->input(3);
    384       auto value_ksize = tensor_ksize.flat<int32>();
    385       ksize.resize(tensor_ksize.shape().num_elements());
    386       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
    387 
    388       const Tensor& tensor_stride = context->input(4);
    389       auto value_stride = tensor_stride.flat<int32>();
    390       stride.resize(tensor_stride.shape().num_elements());
    391       std::copy_n(&value_stride(0), stride.size(), stride.begin());
    392     }
    393     OP_REQUIRES(context, ksize.size() == 4,
    394                 errors::InvalidArgument("Sliding window ksize field must "
    395                                         "specify 4 dimensions"));
    396     OP_REQUIRES(context, stride.size() == 4,
    397                 errors::InvalidArgument("Sliding window strides field must "
    398                                         "specify 4 dimensions"));
    399     const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
    400     const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
    401     OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
    402                 errors::Unimplemented(
    403                     "Pooling is not yet supported on the batch dimension."));
    404 
    405     if (use_dnn_) {
    406       DnnPoolingGradOp<T>::Compute(
    407           context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize,
    408           stride, padding_, data_format_, &tensor_in, &tensor_out, out_backprop,
    409           output_shape, propagate_nans_);
    410     } else {
    411       CHECK(data_format_ == FORMAT_NHWC)
    412           << "Non-Cudnn MaxPoolGrad only supports NHWC format";
    413       MaxPoolingBackwardCustomKernel<T>(context, ksize, stride, padding_,
    414                                         &tensor_in, out_backprop, output_shape);
    415     }
    416   }
    417 
    418  private:
    419   std::vector<int32> ksize_;
    420   std::vector<int32> stride_;
    421   Padding padding_;
    422   TensorFormat data_format_;
    423   bool use_dnn_;
    424   bool propagate_nans_;
    425 };
    426 
    427 #endif  // GOOGLE_CUDA
    428 
    429 // The operation to compute gradient of MaxPool gradients.
    430 // It takes three inputs:
    431 //   - The original input tensor
    432 //   - The original output tensor
    433 //   - Backprop tensor for output gradients
    434 // It produces one output: backprop tensor for output gradient.
    435 template <class Device, class T>
    436 class MaxPoolingGradGradOp : public OpKernel {
    437  public:
    438   explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
    439       : OpKernel(context) {
    440     string data_format;
    441     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    442     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
    443                 errors::InvalidArgument("Invalid data format"));
    444     OP_REQUIRES(
    445         context, data_format_ == FORMAT_NHWC,
    446         errors::InvalidArgument(
    447             "Default MaxPoolingGradGradOp only supports NHWC ",
    448             "on device type ", DeviceTypeString(context->device_type())));
    449 
    450     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    451 
    452     if (context->num_inputs() == 3) {
    453       OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    454       OP_REQUIRES(context, ksize_.size() == 4,
    455                   errors::InvalidArgument("Sliding window ksize field must "
    456                                           "specify 4 dimensions"));
    457       OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    458       OP_REQUIRES(context, stride_.size() == 4,
    459                   errors::InvalidArgument("Sliding window strides field must "
    460                                           "specify 4 dimensions"));
    461       OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
    462                   errors::Unimplemented(
    463                       "Pooling is not yet supported on the batch dimension."));
    464       OP_REQUIRES(context, ksize_[3] == 1 && stride_[3] == 1,
    465                   errors::Unimplemented("MaxPoolingGradGrad is not yet "
    466                                         "supported on the depth dimension."));
    467     }
    468   }
    469 
    470   void Compute(OpKernelContext* context) override {
    471     const Tensor& tensor_in = context->input(0);
    472     const Tensor& tensor_out = context->input(1);
    473     const Tensor& out_grad_backprop = context->input(2);
    474 
    475     // For maxpooling, tensor_in should have 4 dimensions.
    476     OP_REQUIRES(context, tensor_in.dims() == 4,
    477                 errors::InvalidArgument("tensor_in must be 4-dimensional"));
    478     OP_REQUIRES(context, tensor_out.dims() == 4,
    479                 errors::InvalidArgument("tensor_out must be 4-dimensional"));
    480     // For maxpooling, out_grad_backprop should have 4 dimensions.
    481     OP_REQUIRES(
    482         context, out_grad_backprop.dims() == 4,
    483         errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
    484 
    485     std::vector<int32> ksize = ksize_;
    486     std::vector<int32> stride = stride_;
    487     if (context->num_inputs() == 5) {
    488       const Tensor& tensor_ksize = context->input(3);
    489       auto value_ksize = tensor_ksize.flat<int32>();
    490       ksize.resize(tensor_ksize.shape().num_elements());
    491       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
    492 
    493       const Tensor& tensor_stride = context->input(4);
    494       auto value_stride = tensor_stride.flat<int32>();
    495       stride.resize(tensor_stride.shape().num_elements());
    496       std::copy_n(&value_stride(0), stride.size(), stride.begin());
    497     }
    498 
    499     OP_REQUIRES(context, ksize.size() == 4,
    500                 errors::InvalidArgument("Sliding window ksize field must "
    501                                         "specify 4 dimensions"));
    502     OP_REQUIRES(context, stride.size() == 4,
    503                 errors::InvalidArgument("Sliding window strides field must "
    504                                         "specify 4 dimensions"));
    505     OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
    506                 errors::Unimplemented(
    507                     "Pooling is not yet supported on the batch dimension."));
    508     OP_REQUIRES(
    509         context, ksize[3] == 1 && stride[3] == 1,
    510         errors::Unimplemented(
    511             "MaxPoolingGrad is not yet supported on the depth dimension."));
    512 
    513     PoolParameters params{context,  ksize,       stride,
    514                           padding_, FORMAT_NHWC, tensor_in.shape()};
    515     Tensor* output = nullptr;
    516     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
    517                                 {2}, 0, tensor_out.shape(), &output));
    518 
    519     SpatialMaxPoolGradGrad(context, output, tensor_in, tensor_out,
    520                            out_grad_backprop, params, padding_);
    521   }
    522 
    523  private:
    524   void SpatialMaxPoolGradGrad(OpKernelContext* context, Tensor* bottom_diff,
    525                               const Tensor& tensor_in, const Tensor& tensor_out,
    526                               const Tensor& top_diff,
    527                               const PoolParameters& params,
    528                               const Padding& padding) {
    529     typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
    530         ConstEigenMatrixMap;
    531     typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
    532         EigenMatrixMap;
    533 
    534     ConstEigenMatrixMap in_mat(
    535         tensor_in.flat<T>().data(), params.depth,
    536         params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
    537     ConstEigenMatrixMap out_mat(
    538         tensor_out.flat<T>().data(), params.depth,
    539         params.out_width * params.out_height * params.tensor_in_batch);
    540     ConstEigenMatrixMap top_diff_mat(
    541         top_diff.flat<T>().data(), params.depth,
    542         params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
    543     EigenMatrixMap bottom_diff_mat(
    544         bottom_diff->flat<T>().data(), params.depth,
    545         params.out_width * params.out_height * params.tensor_in_batch);
    546 
    547     const DeviceBase::CpuWorkerThreads& worker_threads =
    548         *(context->device()->tensorflow_cpu_worker_threads());
    549 
    550     // The following code basically does the following:
    551     // 1. Flattens the input, output, top_diff and bottom_diff tensors into
    552     //    two dimensional arrays.
    553     //    tensor_in_as_matrix:
    554     //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    555     //    tensor_out_as_matrix:
    556     //      depth by (out_width * out_height * tensor_in_batch)
    557     //    top_diff_as_matrix:
    558     //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    559     //    bottom_diff_as_matrix:
    560     //      depth by (out_width * out_height * tensor_in_batch)
    561     //
    562     // 2. Walks through the set of columns in the flattened
    563     //    tensor_in_as_matrix, tensor_out_as_matrix, top_diff_as_matrix
    564     //    and updates the column(s) corresponding to the maximum values in
    565     //    tensor_out_as_matrix with the corresponding values in
    566     //    top_diff_as_matrix.
    567     auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
    568                      int64 start, int64 limit) {
    569       const int32 depth = params.depth;
    570       const int32 in_rows = params.tensor_in_rows;
    571       const int32 in_cols = params.tensor_in_cols;
    572       const int32 pad_rows = params.pad_rows;
    573       const int32 pad_cols = params.pad_cols;
    574       const int32 window_rows = params.window_rows;
    575       const int32 window_cols = params.window_cols;
    576       const int32 row_stride = params.row_stride;
    577       const int32 col_stride = params.col_stride;
    578       const int32 out_height = params.out_height;
    579       const int32 out_width = params.out_width;
    580 
    581       {
    582         // Initializes the output grad backprop tensor with 0.
    583         const int32 output_image_size = out_height * out_width * params.depth;
    584         EigenMatrixMap bottom_diff_shard(
    585             bottom_diff_mat.data() + start * output_image_size, 1,
    586             (limit - start) * output_image_size);
    587         bottom_diff_shard.setZero();
    588       }
    589 
    590       for (int b = start; b < limit; ++b) {
    591         for (int ph = 0; ph < out_height; ++ph) {
    592           for (int pw = 0; pw < out_width; ++pw) {
    593             // (h_start, h_end) * (w_start, w_end) is the range that the input
    594             // vector projects to.
    595             int h_start = ph * row_stride - pad_rows;
    596             const int h_end = std::min(h_start + window_rows, in_rows);
    597             int w_start = pw * col_stride - pad_cols;
    598             const int w_end = std::min(w_start + window_cols, in_cols);
    599             h_start = std::max(h_start, 0);
    600             w_start = std::max(w_start, 0);
    601             const int out_index = (b * out_height + ph) * out_width + pw;
    602             // Find value corresponding to the input maximum in top_diff.
    603             for (int d = 0; d < depth; ++d) {
    604               const T& output_ref = out_mat.coeffRef(d, out_index);
    605               bool should_stop = false;
    606               for (int h = h_start; h < h_end && !should_stop; ++h) {
    607                 for (int w = w_start; w < w_end && !should_stop; ++w) {
    608                   const int in_index = (b * in_rows + h) * in_cols + w;
    609                   const T& input_ref = in_mat.coeffRef(d, in_index);
    610                   if (output_ref == input_ref) {
    611                     T& bottom_diff_ref = bottom_diff_mat.coeffRef(d, out_index);
    612                     bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
    613                     should_stop = true;
    614                   }
    615                 }
    616               }
    617             }
    618           }
    619         }
    620       }
    621     };
    622 
    623     const int64 shard_cost = params.out_width * params.out_height *
    624                              params.depth * params.window_rows *
    625                              params.window_cols;
    626     Shard(worker_threads.num_threads, worker_threads.workers,
    627           params.tensor_in_batch, shard_cost, shard);
    628   }
    629 
    630   std::vector<int32> ksize_;
    631   std::vector<int32> stride_;
    632   Padding padding_;
    633   TensorFormat data_format_;
    634 };
    635 
    636 #ifdef GOOGLE_CUDA
    637 
    638 template <class T>
    639 class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
    640  public:
    641   typedef Eigen::GpuDevice Device;
    642 
    643   explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
    644       : OpKernel(context) {
    645     string data_format;
    646     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    647     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
    648                 errors::InvalidArgument("Invalid data format"));
    649     if (context->num_inputs() == 3) {
    650       OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    651       OP_REQUIRES(context, ksize_.size() == 4,
    652                   errors::InvalidArgument("Sliding window ksize field must "
    653                                           "specify 4 dimensions"));
    654       OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    655       OP_REQUIRES(context, stride_.size() == 4,
    656                   errors::InvalidArgument("Sliding window strides field must "
    657                                           "specify 4 dimensions"));
    658       const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
    659       const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
    660       OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
    661                   errors::Unimplemented(
    662                       "Pooling is not yet supported on the batch dimension."));
    663     }
    664     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    665   }
    666 
    667   void Compute(OpKernelContext* context) override {
    668     const Tensor& tensor_in = context->input(0);
    669     const Tensor& tensor_out = context->input(1);
    670     const Tensor& out_grad_backprop = context->input(2);
    671 
    672     // For maxpooling, tensor_in should have 4 dimensions.
    673     OP_REQUIRES(context, tensor_in.dims() == 4,
    674                 errors::InvalidArgument("tensor_in must be 4-dimensional 4"));
    675     OP_REQUIRES(context, tensor_out.dims() == 4,
    676                 errors::InvalidArgument("tensor_out must be 4-dimensional"));
    677     // For maxpooling, out_grad_backprop should have 4 dimensions.
    678     OP_REQUIRES(
    679         context, out_grad_backprop.dims() == 4,
    680         errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
    681 
    682     Tensor* output = nullptr;
    683     OP_REQUIRES_OK(context,
    684                    context->allocate_output(0, tensor_out.shape(), &output));
    685 
    686     std::vector<int32> ksize = ksize_;
    687     std::vector<int32> stride = stride_;
    688     if (context->num_inputs() == 5) {
    689       const Tensor& tensor_ksize = context->input(3);
    690       auto value_ksize = tensor_ksize.flat<int32>();
    691       ksize.resize(tensor_ksize.shape().num_elements());
    692       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
    693 
    694       const Tensor& tensor_stride = context->input(4);
    695       auto value_stride = tensor_stride.flat<int32>();
    696       stride.resize(tensor_stride.shape().num_elements());
    697       std::copy_n(&value_stride(0), stride.size(), stride.begin());
    698     }
    699 
    700     OP_REQUIRES(context, ksize.size() == 4,
    701                 errors::InvalidArgument("Sliding window ksize field must "
    702                                         "specify 4 dimensions"));
    703     OP_REQUIRES(context, stride.size() == 4,
    704                 errors::InvalidArgument("Sliding window strides field must "
    705                                         "specify 4 dimensions"));
    706     const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
    707     const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
    708     OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
    709                 errors::Unimplemented(
    710                     "Pooling is not yet supported on the batch dimension."));
    711 
    712     PoolParameters params{context,  ksize,        stride,
    713                           padding_, data_format_, tensor_in.shape()};
    714 
    715     functor::MaxPoolGradBackwardNoMask<T>()(
    716         data_format_, tensor_in.flat<T>().data(), tensor_out.flat<T>().data(),
    717         params.tensor_in_batch, params.out_height, params.out_width,
    718         params.depth, params.tensor_in_rows, params.tensor_in_cols,
    719         params.window_rows, params.window_cols, params.row_stride,
    720         params.col_stride, params.pad_rows, params.pad_cols,
    721         out_grad_backprop.flat<T>().data(), output->flat<T>().data(),
    722         context->eigen_device<Eigen::GpuDevice>());
    723   }
    724 
    725  private:
    726   std::vector<int32> ksize_;
    727   std::vector<int32> stride_;
    728   Padding padding_;
    729   TensorFormat data_format_;
    730   bool use_dnn_;
    731 };
    732 
    733 #endif  // GOOGLE_CUDA
    734 
    735 template <typename Device, typename T>
    736 struct LaunchMaxPoolingNoMask;
    737 
    738 template <typename Device, typename T>
    739 class MaxPoolingNoMaskOp : public OpKernel {
    740  public:
    741   explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
    742       : OpKernel(context) {
    743     string data_format;
    744     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    745     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
    746                 errors::InvalidArgument("Invalid data format"));
    747     OP_REQUIRES(
    748         context, data_format_ == FORMAT_NHWC,
    749         errors::InvalidArgument(
    750             "Default MaxPoolingNoMaskOp only supports NHWC on device type ",
    751             DeviceTypeString(context->device_type())));
    752     OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    753     OP_REQUIRES(context, ksize_.size() == 4,
    754                 errors::InvalidArgument("Sliding window ksize field must "
    755                                         "specify 4 dimensions"));
    756     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    757     OP_REQUIRES(context, stride_.size() == 4,
    758                 errors::InvalidArgument("Sliding window stride field must "
    759                                         "specify 4 dimensions"));
    760     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    761     OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
    762                 errors::Unimplemented(
    763                     "Pooling is not yet supported on the batch dimension."));
    764   }
    765 
    766   void Compute(OpKernelContext* context) override {
    767     const Tensor& tensor_in = context->input(0);
    768 
    769     PoolParameters params{context,  ksize_,       stride_,
    770                           padding_, data_format_, tensor_in.shape()};
    771     if (!context->status().ok()) {
    772       return;
    773     }
    774 
    775     TensorShape out_shape({params.tensor_in_batch, params.out_height,
    776                            params.out_width, params.depth});
    777     Tensor* output = nullptr;
    778     OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
    779 
    780     LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
    781                                               output);
    782   }
    783 
    784  private:
    785   std::vector<int32> ksize_;
    786   std::vector<int32> stride_;
    787   Padding padding_;
    788   TensorFormat data_format_;
    789 };
    790 
    791 template <typename Device, typename T>
    792 class MaxPoolingNoMaskV2Op : public OpKernel {
    793  public:
    794   explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
    795       : OpKernel(context) {
    796     string data_format;
    797     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    798     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
    799                 errors::InvalidArgument("Invalid data format"));
    800     OP_REQUIRES(
    801         context, data_format_ == FORMAT_NHWC,
    802         errors::InvalidArgument(
    803             "Default MaxPoolingNoMaskOp only supports NHWC on device type ",
    804             DeviceTypeString(context->device_type())));
    805     if (context->num_inputs() == 1) {
    806       OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    807       OP_REQUIRES(context, ksize_.size() == 4,
    808                   errors::InvalidArgument("Sliding window ksize field must "
    809                                           "specify 4 dimensions"));
    810       OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    811       OP_REQUIRES(context, stride_.size() == 4,
    812                   errors::InvalidArgument("Sliding window stride field must "
    813                                           "specify 4 dimensions"));
    814       OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
    815                   errors::Unimplemented(
    816                       "Pooling is not yet supported on the batch dimension."));
    817     }
    818     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    819   }
    820 
    821   void Compute(OpKernelContext* context) override {
    822     const Tensor& tensor_in = context->input(0);
    823 
    824     std::vector<int32> ksize = ksize_;
    825     std::vector<int32> stride = stride_;
    826 
    827     if (context->num_inputs() != 1) {
    828       const Tensor& tensor_ksize = context->input(1);
    829       auto value_ksize = tensor_ksize.flat<int32>();
    830       ksize.resize(tensor_ksize.shape().num_elements());
    831       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
    832 
    833       const Tensor& tensor_stride = context->input(2);
    834       auto value_stride = tensor_stride.flat<int32>();
    835       stride.resize(tensor_stride.shape().num_elements());
    836       std::copy_n(&value_stride(0), stride.size(), stride.begin());
    837     }
    838     OP_REQUIRES(context, ksize.size() == 4,
    839                 errors::InvalidArgument("Sliding window ksize field must "
    840                                         "specify 4 dimensions"));
    841     OP_REQUIRES(context, stride.size() == 4,
    842                 errors::InvalidArgument("Sliding window stride field must "
    843                                         "specify 4 dimensions"));
    844     OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
    845                 errors::Unimplemented(
    846                     "Pooling is not yet supported on the batch dimension."));
    847     PoolParameters params{context,  ksize,        stride,
    848                           padding_, data_format_, tensor_in.shape()};
    849     if (!context->status().ok()) {
    850       return;
    851     }
    852 
    853     TensorShape out_shape({params.tensor_in_batch, params.out_height,
    854                            params.out_width, params.depth});
    855     Tensor* output = nullptr;
    856     OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
    857 
    858     LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
    859                                               output);
    860   }
    861 
    862  private:
    863   std::vector<int32> ksize_;
    864   std::vector<int32> stride_;
    865   Padding padding_;
    866   TensorFormat data_format_;
    867 };
    868 
    869 template <typename Device, typename T>
    870 struct LaunchMaxPoolingWithArgmax;
    871 
    872 template <typename Device, typename T>
    873 class MaxPoolingWithArgmaxOp : public OpKernel {
    874  public:
    875   explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context)
    876       : OpKernel(context) {
    877     OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    878     OP_REQUIRES(context, ksize_.size() == 4,
    879                 errors::InvalidArgument("Sliding window ksize field must "
    880                                         "specify 4 dimensions"));
    881     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    882     OP_REQUIRES(context, stride_.size() == 4,
    883                 errors::InvalidArgument("Sliding window stride field must "
    884                                         "specify 4 dimensions"));
    885     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    886     OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
    887                 errors::Unimplemented(
    888                     "Pooling is not yet supported on the batch dimension."));
    889 
    890     TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
    891                                    &propagate_nans_));
    892   }
    893 
    894   void Compute(OpKernelContext* context) override {
    895     const Tensor& tensor_in = context->input(0);
    896 
    897     PoolParameters params{context,  ksize_,      stride_,
    898                           padding_, FORMAT_NHWC, tensor_in.shape()};
    899     if (!context->status().ok()) {
    900       return;
    901     }
    902 
    903     TensorShape out_shape({params.tensor_in_batch, params.out_height,
    904                            params.out_width, params.depth});
    905     Tensor* output = nullptr;
    906     OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
    907     Tensor* argmax = nullptr;
    908     OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax));
    909 
    910     LaunchMaxPoolingWithArgmax<Device, T>::launch(
    911         context, params, tensor_in, output, argmax, propagate_nans_);
    912   }
    913 
    914  private:
    915   std::vector<int32> ksize_;
    916   std::vector<int32> stride_;
    917   Padding padding_;
    918   bool propagate_nans_;
    919 };
    920 
    921 template <typename Device, typename T>
    922 struct LaunchMaxPoolingGradWithArgmax;
    923 
    924 template <typename Device, typename T>
    925 class MaxPoolingGradWithArgmaxOp : public OpKernel {
    926  public:
    927   explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context)
    928       : OpKernel(context) {
    929     string data_format_str;
    930     auto status = context->GetAttr("data_format", &data_format_str);
    931     if (status.ok()) {
    932       OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
    933                   errors::InvalidArgument("Invalid data format"));
    934     }
    935 
    936     OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    937     OP_REQUIRES(context, ksize_.size() == 4,
    938                 errors::InvalidArgument("Sliding window ksize field must "
    939                                         "specify 4 dimensions"));
    940     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    941     OP_REQUIRES(context, stride_.size() == 4,
    942                 errors::InvalidArgument("Sliding window stride field must "
    943                                         "specify 4 dimensions"));
    944     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    945     OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
    946                 errors::Unimplemented(
    947                     "Pooling is not yet supported on the batch dimension."));
    948   }
    949 
    950   void Compute(OpKernelContext* context) override {
    951     const Tensor& tensor_in = context->input(0);
    952     const Tensor& grad_in = context->input(1);
    953     const Tensor& argmax = context->input(2);
    954 
    955     PoolParameters params{context,  ksize_,      stride_,
    956                           padding_, FORMAT_NHWC, tensor_in.shape()};
    957     if (!context->status().ok()) {
    958       return;
    959     }
    960 
    961     TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows,
    962                            params.tensor_in_cols, params.depth});
    963     Tensor* grad_out = nullptr;
    964     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
    965                                 {1}, 0, out_shape, &grad_out));
    966 
    967     LaunchMaxPoolingGradWithArgmax<Device, T>::launch(context, params, grad_in,
    968                                                       argmax, grad_out);
    969   }
    970 
    971  private:
    972   std::vector<int32> ksize_;
    973   std::vector<int32> stride_;
    974   Padding padding_;
    975   TensorFormat data_format_;
    976 };
    977 
    978 template <typename Device, typename T>
    979 struct LaunchMaxPoolingGradGradWithArgmax;
    980 
    981 template <typename Device, typename T>
    982 class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
    983  public:
    984   explicit MaxPoolingGradGradWithArgmaxOp(OpKernelConstruction* context)
    985       : OpKernel(context) {
    986     OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    987     OP_REQUIRES(context, ksize_.size() == 4,
    988                 errors::InvalidArgument("Sliding window ksize field must "
    989                                         "specify 4 dimensions"));
    990     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    991     OP_REQUIRES(context, stride_.size() == 4,
    992                 errors::InvalidArgument("Sliding window stride field must "
    993                                         "specify 4 dimensions"));
    994     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    995     OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
    996                 errors::Unimplemented(
    997                     "Pooling is not yet supported on the batch dimension."));
    998   }
    999 
   1000   void Compute(OpKernelContext* context) override {
   1001     const Tensor& tensor_in = context->input(0);
   1002     const Tensor& grad_in = context->input(1);
   1003     const Tensor& argmax = context->input(2);
   1004 
   1005     PoolParameters params{context,  ksize_,      stride_,
   1006                           padding_, FORMAT_NHWC, tensor_in.shape()};
   1007     if (!context->status().ok()) {
   1008       return;
   1009     }
   1010 
   1011     TensorShape out_shape({params.tensor_in_batch, params.out_height,
   1012                            params.out_width, params.depth});
   1013 
   1014     Tensor* grad_out = nullptr;
   1015     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
   1016                                 {1}, 0, out_shape, &grad_out));
   1017 
   1018     LaunchMaxPoolingGradGradWithArgmax<Device, T>::launch(
   1019         context, params, grad_in, argmax, grad_out);
   1020   }
   1021 
   1022  private:
   1023   std::vector<int32> ksize_;
   1024   std::vector<int32> stride_;
   1025   Padding padding_;
   1026 };
   1027 
   1028 #if GOOGLE_CUDA
   1029 template <typename T>
   1030 class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
   1031  public:
   1032   typedef GPUDevice Device;
   1033   explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
   1034       : OpKernel(context) {
   1035     string data_format;
   1036     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
   1037     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
   1038                 errors::InvalidArgument("Invalid data format"));
   1039     OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
   1040     OP_REQUIRES(context, ksize_.size() == 4,
   1041                 errors::InvalidArgument("Sliding window ksize field must "
   1042                                         "specify 4 dimensions"));
   1043     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
   1044     OP_REQUIRES(context, stride_.size() == 4,
   1045                 errors::InvalidArgument("Sliding window stride field must "
   1046                                         "specify 4 dimensions"));
   1047     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
   1048     const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
   1049     const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
   1050     OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
   1051                 errors::Unimplemented(
   1052                     "Pooling is not yet supported on the batch dimension."));
   1053     use_dnn_ = CanUseCudnn();
   1054 
   1055     TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
   1056                                    &propagate_nans_));
   1057   }
   1058 
   1059   void Compute(OpKernelContext* context) override {
   1060     const Tensor& tensor_in = context->input(0);
   1061 
   1062     PoolParameters params{context,  ksize_,       stride_,
   1063                           padding_, data_format_, tensor_in.shape()};
   1064     if (!context->status().ok()) {
   1065       return;
   1066     }
   1067 
   1068     TensorShape out_shape =
   1069         ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
   1070                         params.out_width, params.depth);
   1071 
   1072     // Assuming qint8 <--> NCHW_VECT_C (int8x4) here.
   1073     constexpr bool is_int8x4 = std::is_same<T, qint8>::value;
   1074     OP_REQUIRES(context, (is_int8x4 == (data_format_ == FORMAT_NCHW_VECT_C)),
   1075                 errors::InvalidArgument(
   1076                     "qint8 should be used with data_format NCHW_VECT_C."));
   1077 
   1078     // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
   1079     if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
   1080       DnnPoolingOp<T>::Compute(context,
   1081                                perftools::gputools::dnn::PoolingMode::kMaximum,
   1082                                ksize_, stride_, padding_, data_format_,
   1083                                tensor_in, out_shape, propagate_nans_);
   1084     } else {
   1085       Tensor* output = nullptr;
   1086       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
   1087       if (is_int8x4) {
   1088         LaunchMaxPoolingNoMask_NCHW_VECT_C<Device>::launch(context, params,
   1089                                                            tensor_in, output);
   1090       } else if (data_format_ == FORMAT_NHWC) {
   1091         LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
   1092                                                   output, propagate_nans_);
   1093       } else {
   1094         LOG(FATAL) << "MaxPool currently only supports the following (layout, "
   1095                       "type) combinations: (NHWC, non-qint8), "
   1096                       "(NCHW, non-qint8) or (NCHW_VECT_C, qint8). The "
   1097                       "requested combination ("
   1098                    << ToString(data_format_) << ", "
   1099                    << DataTypeString(DataTypeToEnum<T>::v())
   1100                    << ") is not supported.";
   1101       }
   1102     }
   1103   }
   1104 
   1105  private:
   1106   std::vector<int32> ksize_;
   1107   std::vector<int32> stride_;
   1108   Padding padding_;
   1109   TensorFormat data_format_;
   1110   bool use_dnn_;
   1111   bool propagate_nans_;
   1112 };
   1113 
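// The V2 variant of the no-mask GPU kernel differs only in how the pooling
// window is specified: when the op has extra inputs, ksize and strides are
// read from host-memory input tensors at run time rather than from attrs, so
// they may change between executions.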
   1114 template <typename T>
   1115 class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
   1116  public:
   1117   typedef GPUDevice Device;
   1118   explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
   1119       : OpKernel(context) {
   1120     string data_format;
   1121     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
   1122     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
   1123                 errors::InvalidArgument("Invalid data format"));
   1124     if (context->num_inputs() == 1) {
   1125       OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
   1126       OP_REQUIRES(context, ksize_.size() == 4,
   1127                   errors::InvalidArgument("Sliding window ksize field must "
   1128                                           "specify 4 dimensions"));
   1129       OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
   1130       OP_REQUIRES(context, stride_.size() == 4,
   1131                   errors::InvalidArgument("Sliding window stride field must "
   1132                                           "specify 4 dimensions"));
   1133       const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
   1134       const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
   1135       OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
   1136                   errors::Unimplemented(
   1137                       "Pooling is not yet supported on the batch dimension."));
   1138     }
   1139     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
   1140     use_dnn_ = CanUseCudnn();
   1141     TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
   1142                                    &propagate_nans_));
   1143   }
   1144 
   1145   void Compute(OpKernelContext* context) override {
   1146     const Tensor& tensor_in = context->input(0);
   1147 
   1148     std::vector<int32> ksize = ksize_;
   1149     std::vector<int32> stride = stride_;
   1150 
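    // If ksize and strides were supplied as input tensors (inputs 1 and 2 of
    // the V2 signature), copy their values over the attr-derived defaults
    // before validating them.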
   1151     if (context->num_inputs() != 1) {
   1152       const Tensor& tensor_ksize = context->input(1);
   1153       auto value_ksize = tensor_ksize.flat<int32>();
   1154       ksize.resize(tensor_ksize.shape().num_elements());
   1155       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
   1156 
   1157       const Tensor& tensor_stride = context->input(2);
   1158       auto value_stride = tensor_stride.flat<int32>();
   1159       stride.resize(tensor_stride.shape().num_elements());
   1160       std::copy_n(&value_stride(0), stride.size(), stride.begin());
   1161     }
   1162     OP_REQUIRES(context, ksize.size() == 4,
   1163                 errors::InvalidArgument("Sliding window ksize field must "
   1164                                         "specify 4 dimensions"));
   1165     OP_REQUIRES(context, stride.size() == 4,
   1166                 errors::InvalidArgument("Sliding window stride field must "
   1167                                         "specify 4 dimensions"));
   1168     const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
   1169     const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
   1170     OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
   1171                 errors::Unimplemented(
   1172                     "Pooling is not yet supported on the batch dimension."));
   1173 
   1174     PoolParameters params{context,  ksize,        stride,
   1175                           padding_, data_format_, tensor_in.shape()};
   1176     if (!context->status().ok()) {
   1177       return;
   1178     }
   1179 
   1180     TensorShape out_shape =
   1181         ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
   1182                         params.out_width, params.depth);
   1183     if (use_dnn_ && data_format_ == FORMAT_NCHW) {
   1184       DnnPoolingOp<T>::Compute(context,
   1185                                perftools::gputools::dnn::PoolingMode::kMaximum,
   1186                                ksize, stride, padding_, data_format_, tensor_in,
   1187                                out_shape, propagate_nans_);
   1188     } else {
   1189       CHECK(data_format_ == FORMAT_NHWC)
   1190           << "Non-Cudnn MaxPool only supports NHWC format";
   1191       Tensor* output = nullptr;
   1192       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
   1193       LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
   1194                                                 output, propagate_nans_);
   1195     }
   1196   }
   1197 
   1198  private:
   1199   std::vector<int32> ksize_;
   1200   std::vector<int32> stride_;
   1201   Padding padding_;
   1202   TensorFormat data_format_;
   1203   bool use_dnn_;
   1204   bool propagate_nans_;
   1205 };
   1206 
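// GPU launcher for the forward pass without an argmax output: the argmax
// pointer passed to the CUDA functor is null, and a failed launch is surfaced
// as an Internal error on the context.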
   1207 template <typename T>
   1208 struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
   1209   static void launch(OpKernelContext* context, const PoolParameters& params,
   1210                      const Tensor& input, Tensor* output, bool propagate_nans) {
   1211     bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
   1212         input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
   1213         params.tensor_in_cols, params.depth, params.out_height,
   1214         params.out_width, params.window_rows, params.window_cols,
   1215         params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
   1216         output->flat<T>().data(), nullptr, context->eigen_gpu_device(),
   1217         propagate_nans);
   1218     if (!status) {
   1219       context->SetStatus(
   1220           errors::Internal("Failed launching MaxPoolForwardNoMask"));
   1221     }
   1222   }
   1223 };
   1224 
   1225 template <typename T>
   1226 struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
   1227   static void launch(OpKernelContext* context, const PoolParameters& params,
   1228                      const Tensor& input, Tensor* output, Tensor* argmax,
   1229                      bool propagate_nans) {
   1230     bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
   1231         input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
   1232         params.tensor_in_cols, params.depth, params.out_height,
   1233         params.out_width, params.window_rows, params.window_cols,
   1234         params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
   1235         output->flat<T>().data(),
   1236         reinterpret_cast<int64*>(argmax->flat<int64>().data()),
   1237         context->eigen_gpu_device(), propagate_nans);
   1238     if (!status) {
   1239       context->SetStatus(
   1240           errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
   1241     }
   1242   }
   1243 };
   1244 
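// GPU launcher for MaxPoolGradWithArgmax. top_offset and bottom_offset are
// the per-batch element counts of the pooled output and of the input,
// respectively; the kernel uses them together with the argmax indices to
// route each incoming gradient value back to the input position that produced
// the maximum.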
   1245 template <typename T>
   1246 struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
   1247   static void launch(OpKernelContext* context, const PoolParameters& params,
   1248                      const Tensor& grad_in, const Tensor& argmax,
   1249                      Tensor* grad_out) {
   1250     const int input_size = params.tensor_in_batch * params.tensor_in_rows *
   1251                            params.tensor_in_cols * params.depth;
   1252     const int output_size = params.tensor_in_batch * params.out_height *
   1253                             params.out_width * params.depth;
   1254     const int top_offset = params.out_height * params.out_width * params.depth;
   1255     const int bottom_offset =
   1256         params.tensor_in_rows * params.tensor_in_cols * params.depth;
   1257     bool status = functor::MaxPoolBackwardWithArgmax<T>()(
   1258         output_size, input_size, grad_in.flat<T>().data(),
   1259         reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
   1260         bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
   1261     if (!status) {
   1262       context->SetStatus(
   1263           errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
   1264     }
   1265   }
   1266 };
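// GPU launcher for MaxPoolGradGradWithArgmax. Note that top_offset and
// bottom_offset are swapped relative to the first-order gradient above,
// because the second-order pass maps an input-shaped gradient to an
// output-shaped result, again guided by the argmax indices.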
   1267 
   1268 template <typename T>
   1269 struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
   1270   static void launch(OpKernelContext* context, const PoolParameters& params,
   1271                      const Tensor& grad_in, const Tensor& argmax,
   1272                      Tensor* grad_out) {
   1273     const int input_size = params.tensor_in_batch * params.tensor_in_rows *
   1274                            params.tensor_in_cols * params.depth;
   1275     const int output_size = params.tensor_in_batch * params.out_height *
   1276                             params.out_width * params.depth;
   1277     const int top_offset =
   1278         params.tensor_in_rows * params.tensor_in_cols * params.depth;
   1279     const int bottom_offset =
   1280         params.out_width * params.out_height * params.depth;
   1281     bool status = functor::MaxPoolGradBackwardWithArgmax<T>()(
   1282         output_size, input_size, grad_in.flat<T>().data(),
   1283         reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
   1284         bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
   1285     if (!status) {
   1286       context->SetStatus(
   1287           errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax"));
   1288     }
   1289   }
   1290 };
   1291 
   1292 #endif  // GOOGLE_CUDA
   1293 
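// Registers the gradient kernels shared by CPU and GPU for one
// (device, type) pair: both the attr-based ops and the V2 ops that take
// ksize/strides as host-memory inputs.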
   1294 #define REGISTER_MAX_POOL_KERNELS(D, T)                                  \
   1295   REGISTER_KERNEL_BUILDER(                                               \
   1296       Name("MaxPoolGrad").Device(DEVICE_##D).TypeConstraint<T>("T"),     \
   1297       MaxPoolingGradOp<D##Device, T>);                                   \
   1298   REGISTER_KERNEL_BUILDER(                                               \
   1299       Name("MaxPoolGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
   1300       MaxPoolingGradGradOp<D##Device, T>);                               \
   1301   REGISTER_KERNEL_BUILDER(Name("MaxPoolGradV2")                          \
   1302                               .Device(DEVICE_##D)                        \
   1303                               .HostMemory("ksize")                       \
   1304                               .HostMemory("strides")                     \
   1305                               .TypeConstraint<T>("T"),                   \
   1306                           MaxPoolingGradOp<D##Device, T>);               \
   1307   REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradV2")                      \
   1308                               .Device(DEVICE_##D)                        \
   1309                               .HostMemory("ksize")                       \
   1310                               .HostMemory("strides")                     \
   1311                               .TypeConstraint<T>("T"),                   \
   1312                           MaxPoolingGradGradOp<D##Device, T>);
   1313 
    1314 // The kernels below are implemented only for the CPU device.
   1315 #define REGISTER_CPU_ONLY_POOL_KERNELS(T)                          \
   1316   REGISTER_KERNEL_BUILDER(                                         \
   1317       Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
   1318       MaxPoolingOp<CPUDevice, T>);                                 \
   1319   REGISTER_KERNEL_BUILDER(                                         \
   1320       Name("MaxPoolV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
   1321       MaxPoolingV2Op<CPUDevice, T>);
   1322 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
   1323 #undef REGISTER_CPU_ONLY_POOL_KERNELS
   1324 
   1325 #define REGISTER_CPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(CPU, T);
   1326 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS);
    1327 #undef REGISTER_CPU_MAX_POOL_KERNELS
   1328 
   1329 #if GOOGLE_CUDA
   1330 
   1331 // Forward declarations for the functor specializations for GPU.
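// The extern template declarations below keep this translation unit from
// instantiating SpatialMaxPooling for the GPU; the definitions come from the
// separately compiled CUDA object file.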
   1332 namespace functor {
   1333 #define DECLARE_GPU_SPEC(T)                                            \
   1334   template <>                                                          \
   1335   void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()(             \
   1336       const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
   1337       typename TTypes<T, 4>::ConstTensor input, int window_rows,       \
   1338       int window_cols, int row_stride, int col_stride,                 \
   1339       const Eigen::PaddingType& padding);                              \
   1340   extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;
   1341 
   1342 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
   1343 #undef DECLARE_GPU_SPEC
   1344 }  // namespace functor
   1345 
   1346 #define REGISTER_GPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(GPU, T)
   1347 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
   1348 #undef REGISTER_GPU_MAX_POOL_KERNELS
   1349 
    1350 // The kernels below are currently implemented only for the GPU device.
    1351 // Note(jiayq): Currently, the custom Caffe implementation is faster than the
    1352 // default Eigen implementation, so we use the custom kernel as the default.
    1353 // However, you can explicitly invoke the Eigen version using
    1354 // kernel_label_map.
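// For example (illustrative only; _kernel_label_map is an internal Python
// API and may change), building a graph under
//   g._kernel_label_map({"MaxPool": "eigen_tensor"})
// selects the "eigen_tensor"-labeled registrations below instead of the
// default custom kernel.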
   1355 #define REGISTER_GPU_ONLY_POOL_KERNELS(T)                            \
   1356   REGISTER_KERNEL_BUILDER(Name("MaxPool")                            \
   1357                               .Device(DEVICE_GPU)                    \
   1358                               .TypeConstraint<T>("T")                \
   1359                               .Label("eigen_tensor"),                \
   1360                           MaxPoolingOp<GPUDevice, T>);               \
   1361   REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                          \
   1362                               .Device(DEVICE_GPU)                    \
   1363                               .HostMemory("ksize")                   \
   1364                               .HostMemory("strides")                 \
   1365                               .TypeConstraint<T>("T")                \
   1366                               .Label("eigen_tensor"),                \
   1367                           MaxPoolingV2Op<GPUDevice, T>);             \
   1368   REGISTER_KERNEL_BUILDER(                                           \
   1369       Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"),     \
   1370       MaxPoolingNoMaskOp<GPUDevice, T>);                             \
   1371   REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                          \
   1372                               .Device(DEVICE_GPU)                    \
   1373                               .HostMemory("ksize")                   \
   1374                               .HostMemory("strides")                 \
   1375                               .TypeConstraint<T>("T"),               \
   1376                           MaxPoolingNoMaskV2Op<GPUDevice, T>);       \
   1377   REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                  \
   1378                               .Device(DEVICE_GPU)                    \
   1379                               .TypeConstraint<int64>("Targmax")      \
   1380                               .TypeConstraint<T>("T"),               \
   1381                           MaxPoolingWithArgmaxOp<GPUDevice, T>);     \
   1382   REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")              \
   1383                               .Device(DEVICE_GPU)                    \
   1384                               .TypeConstraint<T>("T")                \
   1385                               .TypeConstraint<int64>("Targmax"),     \
   1386                           MaxPoolingGradWithArgmaxOp<GPUDevice, T>); \
   1387   REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")          \
   1388                               .Device(DEVICE_GPU)                    \
   1389                               .TypeConstraint<T>("T")                \
   1390                               .TypeConstraint<int64>("Targmax"),     \
   1391                           MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>);
   1392 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS);
   1393 
   1394 // TODO(b/65847473): Re-enable once the underlying build error is fixed.
   1395 #if !defined(PLATFORM_WINDOWS)
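// qint8 forward kernels. MaxPoolingNoMaskOp checks at run time that qint8 is
// only used with data_format NCHW_VECT_C (see the is_int8x4 check above).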
   1396 REGISTER_KERNEL_BUILDER(
   1397     Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
   1398     MaxPoolingNoMaskOp<GPUDevice, qint8>);
   1399 
   1400 REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
   1401                             .Device(DEVICE_GPU)
   1402                             .HostMemory("ksize")
   1403                             .HostMemory("strides")
   1404                             .TypeConstraint<qint8>("T"),
   1405                         MaxPoolingV2Op<GPUDevice, qint8>);
   1406 
   1407 REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
   1408                             .Device(DEVICE_GPU)
   1409                             .HostMemory("ksize")
   1410                             .HostMemory("strides")
   1411                             .TypeConstraint<qint8>("T")
   1412                             .Label("eigen_tensor"),
   1413                         MaxPoolingV2Op<GPUDevice, qint8>);
   1414 #endif  // !defined(PLATFORM_WINDOWS)
   1415 
   1416 #undef REGISTER_GPU_ONLY_POOL_KERNELS
   1417 
   1418 #endif  // GOOGLE_CUDA
   1419 
   1420 #undef REGISTER_MAX_POOL_KERNELS
   1421 
   1422 }  // namespace tensorflow
   1423