/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/maxpooling_op.h"

#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/eigen_pooling.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/pooling_ops_common.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"

#if GOOGLE_CUDA
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

const int kInvalidMaxPoolingIndex = -1;

template <typename Device, typename T>
static void SpatialMaxPoolWithArgMaxHelper(
    OpKernelContext* context, Tensor* output, Tensor* output_arg_max,
    Tensor* input_backprop, const Tensor& tensor_in,
    const Tensor& out_backprop, const PoolParameters& params,
    const Padding& padding) {
  typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      ConstEigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      EigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<int64, Eigen::Dynamic, Eigen::Dynamic>>
      EigenIndexMatrixMap;

  ConstEigenMatrixMap in_mat(
      tensor_in.flat<T>().data(), params.depth,
      params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
  EigenMatrixMap out_mat(
      output->flat<T>().data(), params.depth,
      params.out_width * params.out_height * params.tensor_in_batch);
  EigenIndexMatrixMap out_arg_max_mat(
      output_arg_max->flat<int64>().data(), params.depth,
      params.out_width * params.out_height * params.tensor_in_batch);

  const DeviceBase::CpuWorkerThreads& worker_threads =
      *(context->device()->tensorflow_cpu_worker_threads());
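  // For example, with this depth-major layout, column j of in_mat holds all
  // `depth` channel values of input pixel (b, h, w), where
  // j = (b * tensor_in_rows + h) * tensor_in_cols + w; out_mat and
  // out_arg_max_mat are laid out the same way over the output grid.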
  // The following code basically does the following:
  // 1. Flattens the input and output tensors into two dimensional arrays.
  //    tensor_in_as_matrix:
  //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
  //    output_as_matrix:
  //      depth by (out_width * out_height * tensor_in_batch)
  //
  // 2. Walks through the set of columns in the flattened
  //    tensor_in_as_matrix, and updates the corresponding column(s) in
  //    output_as_matrix with the max value.
  auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop,
                &output_arg_max, &out_backprop](int64 start, int64 limit) {
    const int32 depth = params.depth;
    const int32 in_rows = params.tensor_in_rows;
    const int32 in_cols = params.tensor_in_cols;
    const int32 pad_rows = params.pad_rows;
    const int32 pad_cols = params.pad_cols;
    const int32 window_rows = params.window_rows;
    const int32 window_cols = params.window_cols;
    const int32 row_stride = params.row_stride;
    const int32 col_stride = params.col_stride;
    const int32 out_height = params.out_height;
    const int32 out_width = params.out_width;

    {
      // Initializes the output tensor with MIN<T>.
      const int32 output_image_size = out_height * out_width * depth;
      EigenMatrixMap out_shard(out_mat.data() + start * output_image_size, 1,
                               (limit - start) * output_image_size);
      out_shard.setConstant(Eigen::NumTraits<T>::lowest());
      EigenIndexMatrixMap out_arg_max_shard(
          out_arg_max_mat.data() + start * output_image_size, 1,
          (limit - start) * output_image_size);
      out_arg_max_shard.setConstant(kInvalidMaxPoolingIndex);
    }

    for (int64 b = start; b < limit; ++b) {
      for (int h = 0; h < in_rows; ++h) {
        for (int w = 0; w < in_cols; ++w) {
          // (h_start, h_end) * (w_start, w_end) is the range that the input
          // vector projects to.
          const int hpad = h + pad_rows;
          const int wpad = w + pad_cols;
          const int h_start =
              (hpad < window_rows) ? 0 : (hpad - window_rows) / row_stride + 1;
          const int h_end = std::min(hpad / row_stride + 1, out_height);
          const int w_start =
              (wpad < window_cols) ? 0 : (wpad - window_cols) / col_stride + 1;
          const int w_end = std::min(wpad / col_stride + 1, out_width);
          // compute elementwise max
          const int64 in_index = (b * in_rows + h) * in_cols + w;
          for (int ph = h_start; ph < h_end; ++ph) {
            const int64 out_index_base = (b * out_height + ph) * out_width;
            for (int pw = w_start; pw < w_end; ++pw) {
              const int64 out_index = out_index_base + pw;
              /// NOTES(zhengxq): not using the eigen matrix operation for
              /// now.
              for (int d = 0; d < depth; ++d) {
                const T& input_ref = in_mat.coeffRef(d, in_index);
                T& output_ref = out_mat.coeffRef(d, out_index);
                int64& out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index);
                if (output_ref < input_ref ||
                    out_arg_max_ref == kInvalidMaxPoolingIndex) {
                  output_ref = input_ref;
                  int64 input_offset = in_index * depth + d;
                  out_arg_max_ref = input_offset;
                }
              }
            }
          }
        }
      }
    }

    {
      auto input_backprop_flat = input_backprop->flat<T>();
      auto out_arg_max_flat = output_arg_max->flat<int64>();
      auto out_backprop_flat = out_backprop.flat<T>();

      // Initialize output to 0.
      const int64 in_size = in_rows * in_cols * depth;
      const int64 in_start = start * in_size;
      const int64 in_end = limit * in_size;
      EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1,
                              in_end - in_start);
      in_shard.setConstant(T(0));

      // Backpropagate.
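      // For example, if out_arg_max_flat(index) records that output element
      // `index` took its maximum from flattened input offset p, the loop
      // below accumulates out_backprop_flat(index) into
      // input_backprop_flat(p); overlapping windows that share one input
      // maximum have their gradients summed at p.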
      const int out_size = out_height * out_width * depth;
      const int out_start = start * out_size;
      const int out_end = limit * out_size;
      for (int index = out_start; index < out_end; ++index) {
        int input_backprop_index = out_arg_max_flat(index);
        // Although this check is in the inner loop, it is worth the cost:
        // it prevents memory corruption from an out-of-range index, and our
        // benchmarks show that the performance impact is quite small.
        CHECK(input_backprop_index >= in_start && input_backprop_index < in_end)
            << "Invalid input backprop index: " << input_backprop_index << ", "
            << in_start << ", " << in_end;
        input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
      }
    }
  };

  const int64 shard_cost = params.tensor_in_rows * params.tensor_in_cols *
                           params.depth * params.window_rows *
                           params.window_cols;
  Shard(worker_threads.num_threads, worker_threads.workers,
        params.tensor_in_batch, shard_cost, shard);
}
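// As a concrete illustration of the argmax encoding above (hypothetical
// shapes): pooling an NHWC input of shape [1, 2, 2, 1] with a 2x2 window,
// stride 1 and VALID padding yields a single output value, and its
// out_arg_max entry is in_index * depth + d, i.e. the flattened NHWC offset
// (0..3) of the winning input element.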
// The operation to compute MaxPool gradients.
// It takes three inputs:
//   - The original input tensor
//   - The original output tensor
//   - Backprop tensor for output
// It produces one output: backprop tensor for input.
template <class Device, class T>
class MaxPoolingGradOp : public OpKernel {
 public:
  explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument("Default MaxPoolingGradOp only supports NHWC ",
                                "on device type ",
                                DeviceTypeString(context->device_type())));

    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
      OP_REQUIRES(
          context, ksize_[3] == 1 && stride_[3] == 1,
          errors::Unimplemented(
              "MaxPoolingGrad is not yet supported on the depth dimension."));
    }

    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));

    const TensorShape& output_shape = tensor_in.shape();

    Tensor tensor_out_dup;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_temp(
                                {1}, DataTypeToEnum<T>::v(), tensor_out.shape(),
                                &tensor_out_dup));
    Tensor tensor_out_arg_max;
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(),
                                                   tensor_out.shape(),
                                                   &tensor_out_arg_max));
    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;
    if (context->num_inputs() == 5) {
      const Tensor& tensor_ksize = context->input(3);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(4);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(
        context, ksize[3] == 1 && stride[3] == 1,
        errors::Unimplemented(
            "MaxPoolingGrad is not yet supported on the depth dimension."));

    PoolParameters params{context,  ksize,       stride,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {0}, 0, output_shape, &output));

    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
        context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
        out_backprop, params, padding_);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

#ifdef GOOGLE_CUDA

template <typename T>
static void MaxPoolingBackwardCustomKernel(
    OpKernelContext* context, const std::vector<int32>& size,
    const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in,
    const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
  Tensor* output = nullptr;
  OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                              {0}, 0, tensor_in_shape, &output));

  PoolParameters params{context, size,        stride,
                        padding, FORMAT_NHWC, tensor_in_shape};
  if (!context->status().ok()) {
    return;
  }

  functor::MaxPoolBackwardNoMask<T>()(
      tensor_in->flat<T>().data(), params.tensor_in_batch,
      params.tensor_in_rows, params.tensor_in_cols, params.depth,
      params.out_height, params.out_width, params.window_rows,
      params.window_cols, params.row_stride, params.col_stride,
      params.pad_rows, params.pad_cols, out_backprop.flat<T>().data(),
      output->flat<T>().data(), context->eigen_device<Eigen::GpuDevice>());
}

template <class T>
class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
 public:
  typedef Eigen::GpuDevice Device;

  explicit MaxPoolingGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
      const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));

    use_dnn_ = CanUseCudnn();
    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));

    TensorShape output_shape = tensor_in.shape();

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;
    if (context->num_inputs() == 5) {
      const Tensor& tensor_ksize = context->input(3);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(4);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    if (use_dnn_) {
      DnnPoolingGradOp<T>::Compute(
          context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize,
          stride, padding_, data_format_, &tensor_in, &tensor_out,
          out_backprop, output_shape, propagate_nans_);
    } else {
      CHECK(data_format_ == FORMAT_NHWC)
          << "Non-Cudnn MaxPoolGrad only supports NHWC format";
      MaxPoolingBackwardCustomKernel<T>(context, ksize, stride, padding_,
                                        &tensor_in, out_backprop,
                                        output_shape);
    }
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool use_dnn_;
  bool propagate_nans_;
};

#endif  // GOOGLE_CUDA

// The operation to compute gradient of MaxPool gradients.
// It takes three inputs:
//   - The original input tensor
//   - The original output tensor
//   - Backprop tensor for output gradients
// It produces one output: backprop tensor for output gradient.
template <class Device, class T>
class MaxPoolingGradGradOp : public OpKernel {
 public:
  explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument(
            "Default MaxPoolingGradGradOp only supports NHWC ",
            "on device type ", DeviceTypeString(context->device_type())));

    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));

    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
      OP_REQUIRES(context, ksize_[3] == 1 && stride_[3] == 1,
                  errors::Unimplemented("MaxPoolingGradGrad is not yet "
                                        "supported on the depth dimension."));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_grad_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_grad_backprop should have 4 dimensions.
    OP_REQUIRES(
        context, out_grad_backprop.dims() == 4,
        errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;
    if (context->num_inputs() == 5) {
      const Tensor& tensor_ksize = context->input(3);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(4);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(context, ksize[3] == 1 && stride[3] == 1,
                errors::Unimplemented("MaxPoolingGradGrad is not yet "
                                      "supported on the depth dimension."));

    PoolParameters params{context,  ksize,       stride,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {2}, 0, tensor_out.shape(), &output));

    SpatialMaxPoolGradGrad(context, output, tensor_in, tensor_out,
                           out_grad_backprop, params, padding_);
  }

 private:
  void SpatialMaxPoolGradGrad(OpKernelContext* context, Tensor* bottom_diff,
                              const Tensor& tensor_in,
                              const Tensor& tensor_out, const Tensor& top_diff,
                              const PoolParameters& params,
                              const Padding& padding) {
    typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        ConstEigenMatrixMap;
    typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        EigenMatrixMap;

    ConstEigenMatrixMap in_mat(
        tensor_in.flat<T>().data(), params.depth,
        params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
    ConstEigenMatrixMap out_mat(
        tensor_out.flat<T>().data(), params.depth,
        params.out_width * params.out_height * params.tensor_in_batch);
    ConstEigenMatrixMap top_diff_mat(
        top_diff.flat<T>().data(), params.depth,
        params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
    EigenMatrixMap bottom_diff_mat(
        bottom_diff->flat<T>().data(), params.depth,
        params.out_width * params.out_height * params.tensor_in_batch);

    const DeviceBase::CpuWorkerThreads& worker_threads =
        *(context->device()->tensorflow_cpu_worker_threads());

    // The following code basically does the following:
    // 1. Flattens the input, output, top_diff and bottom_diff tensors into
    //    two dimensional arrays.
    //    tensor_in_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    tensor_out_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //    top_diff_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    bottom_diff_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //
    // 2. Walks through the set of columns in the flattened
    //    tensor_in_as_matrix, tensor_out_as_matrix, top_diff_as_matrix
    //    and updates the column(s) corresponding to the maximum values in
    //    tensor_out_as_matrix with the corresponding values in
    //    top_diff_as_matrix.
    auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
                     int64 start, int64 limit) {
      const int32 depth = params.depth;
      const int32 in_rows = params.tensor_in_rows;
      const int32 in_cols = params.tensor_in_cols;
      const int32 pad_rows = params.pad_rows;
      const int32 pad_cols = params.pad_cols;
      const int32 window_rows = params.window_rows;
      const int32 window_cols = params.window_cols;
      const int32 row_stride = params.row_stride;
      const int32 col_stride = params.col_stride;
      const int32 out_height = params.out_height;
      const int32 out_width = params.out_width;

      {
        // Initializes the output grad backprop tensor with 0.
        const int32 output_image_size = out_height * out_width * params.depth;
        EigenMatrixMap bottom_diff_shard(
            bottom_diff_mat.data() + start * output_image_size, 1,
            (limit - start) * output_image_size);
        bottom_diff_shard.setZero();
      }

      for (int b = start; b < limit; ++b) {
        for (int ph = 0; ph < out_height; ++ph) {
          for (int pw = 0; pw < out_width; ++pw) {
            // (h_start, h_end) * (w_start, w_end) is the range that the input
            // vector projects to.
            int h_start = ph * row_stride - pad_rows;
            const int h_end = std::min(h_start + window_rows, in_rows);
            int w_start = pw * col_stride - pad_cols;
            const int w_end = std::min(w_start + window_cols, in_cols);
            h_start = std::max(h_start, 0);
            w_start = std::max(w_start, 0);
            const int out_index = (b * out_height + ph) * out_width + pw;
            // Find the value corresponding to the input maximum in top_diff.
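            // If several inputs in the window tie for the maximum, the scan
            // below stops at the first match in row-major (h, then w) order,
            // so only that input's top_diff value is propagated.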
            for (int d = 0; d < depth; ++d) {
              const T& output_ref = out_mat.coeffRef(d, out_index);
              bool should_stop = false;
              for (int h = h_start; h < h_end && !should_stop; ++h) {
                for (int w = w_start; w < w_end && !should_stop; ++w) {
                  const int in_index = (b * in_rows + h) * in_cols + w;
                  const T& input_ref = in_mat.coeffRef(d, in_index);
                  if (output_ref == input_ref) {
                    T& bottom_diff_ref = bottom_diff_mat.coeffRef(d, out_index);
                    bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
                    should_stop = true;
                  }
                }
              }
            }
          }
        }
      }
    };

    const int64 shard_cost = params.out_width * params.out_height *
                             params.depth * params.window_rows *
                             params.window_cols;
    Shard(worker_threads.num_threads, worker_threads.workers,
          params.tensor_in_batch, shard_cost, shard);
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

#ifdef GOOGLE_CUDA

template <class T>
class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
 public:
  typedef Eigen::GpuDevice Device;

  explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
      const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_grad_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_grad_backprop should have 4 dimensions.
    OP_REQUIRES(
        context, out_grad_backprop.dims() == 4,
        errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, tensor_out.shape(), &output));

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;
    if (context->num_inputs() == 5) {
      const Tensor& tensor_ksize = context->input(3);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(4);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    PoolParameters params{context,  ksize,        stride,
                          padding_, data_format_, tensor_in.shape()};

    functor::MaxPoolGradBackwardNoMask<T>()(
        data_format_, tensor_in.flat<T>().data(), tensor_out.flat<T>().data(),
        params.tensor_in_batch, params.out_height, params.out_width,
        params.depth, params.tensor_in_rows, params.tensor_in_cols,
        params.window_rows, params.window_cols, params.row_stride,
        params.col_stride, params.pad_rows, params.pad_cols,
        out_grad_backprop.flat<T>().data(), output->flat<T>().data(),
        context->eigen_device<Eigen::GpuDevice>());
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool use_dnn_;
};

#endif  // GOOGLE_CUDA
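// The Launch* structs declared below follow a common dispatch pattern: each
// op kernel is written against a single `launch` entry point, and the
// per-device implementation is supplied by a partial specialization (see the
// Eigen::GpuDevice specializations in the GOOGLE_CUDA section further down).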
template <typename Device, typename T>
struct LaunchMaxPoolingNoMask;

template <typename Device, typename T>
class MaxPoolingNoMaskOp : public OpKernel {
 public:
  explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument(
            "Default MaxPoolingNoMaskOp only supports NHWC on device type ",
            DeviceTypeString(context->device_type())));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    PoolParameters params{context,  ksize_,       stride_,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                              output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};
template <typename Device, typename T>
class MaxPoolingNoMaskV2Op : public OpKernel {
 public:
  explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument(
            "Default MaxPoolingNoMaskOp only supports NHWC on device type ",
            DeviceTypeString(context->device_type())));
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;

    if (context->num_inputs() != 1) {
      const Tensor& tensor_ksize = context->input(1);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(2);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    PoolParameters params{context,  ksize,        stride,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                              output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename Device, typename T>
struct LaunchMaxPoolingWithArgmax;

template <typename Device, typename T>
class MaxPoolingWithArgmaxOp : public OpKernel {
 public:
  explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    PoolParameters params{context,  ksize_,      stride_,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
    Tensor* argmax = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax));

    LaunchMaxPoolingWithArgmax<Device, T>::launch(
        context, params, tensor_in, output, argmax, propagate_nans_);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  bool propagate_nans_;
};
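// For reference: the argmax output allocated above has the same NHWC shape as
// the pooled output, and each entry holds an int64 flattened index of the
// winning input element; MaxPoolingGradWithArgmaxOp below uses those indices
// to route gradients back without re-scanning the pooling windows.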
template <typename Device, typename T>
struct LaunchMaxPoolingGradWithArgmax;

template <typename Device, typename T>
class MaxPoolingGradWithArgmaxOp : public OpKernel {
 public:
  explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format_str;
    auto status = context->GetAttr("data_format", &data_format_str);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
    }

    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& grad_in = context->input(1);
    const Tensor& argmax = context->input(2);

    PoolParameters params{context,  ksize_,      stride_,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows,
                           params.tensor_in_cols, params.depth});
    Tensor* grad_out = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {1}, 0, out_shape, &grad_out));

    LaunchMaxPoolingGradWithArgmax<Device, T>::launch(context, params, grad_in,
                                                      argmax, grad_out);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename Device, typename T>
struct LaunchMaxPoolingGradGradWithArgmax;

template <typename Device, typename T>
class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
 public:
  explicit MaxPoolingGradGradWithArgmaxOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& grad_in = context->input(1);
    const Tensor& argmax = context->input(2);

    PoolParameters params{context,  ksize_,      stride_,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});

    Tensor* grad_out = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {1}, 0, out_shape, &grad_out));

    LaunchMaxPoolingGradGradWithArgmax<Device, T>::launch(
        context, params, grad_in, argmax, grad_out);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
};
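// Note the symmetry between the two argmax-based gradient ops above: the
// first-order op scatters grad_in values to the input positions named by
// argmax (so its output takes the input's shape), while the second-order op
// gathers from grad_in at those same positions (so its output takes the
// pooled output's shape).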
#if GOOGLE_CUDA
template <typename T>
class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
 public:
  typedef GPUDevice Device;
  explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    use_dnn_ = CanUseCudnn();

    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    PoolParameters params{context,  ksize_,       stride_,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape =
        ShapeFromFormat(data_format_, params.tensor_in_batch,
                        params.out_height, params.out_width, params.depth);

    // Assuming qint8 <--> NCHW_VECT_C (int8x4) here.
    constexpr bool is_int8x4 = std::is_same<T, qint8>::value;
    OP_REQUIRES(context, (is_int8x4 == (data_format_ == FORMAT_NCHW_VECT_C)),
                errors::InvalidArgument(
                    "qint8 should be used with data_format NCHW_VECT_C."));
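    // NCHW_VECT_C packs groups of four int8 channel values into one int8x4
    // element; e.g. a float NCHW tensor of shape [N, C, H, W] corresponds to
    // a qint8 NCHW_VECT_C tensor of shape [N, C / 4, H, W, 4].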
    // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
    if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
      DnnPoolingOp<T>::Compute(context,
                               perftools::gputools::dnn::PoolingMode::kMaximum,
                               ksize_, stride_, padding_, data_format_,
                               tensor_in, out_shape, propagate_nans_);
    } else {
      Tensor* output = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
      if (is_int8x4) {
        LaunchMaxPoolingNoMask_NCHW_VECT_C<Device>::launch(context, params,
                                                           tensor_in, output);
      } else if (data_format_ == FORMAT_NHWC) {
        LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                                  output, propagate_nans_);
      } else {
        LOG(FATAL) << "MaxPool currently only supports the following (layout, "
                      "type) combinations: (NHWC, non-qint8), "
                      "(NCHW, non-qint8) or (NCHW_VECT_C, qint8). The "
                      "requested combination ("
                   << ToString(data_format_) << ", "
                   << DataTypeString(DataTypeToEnum<T>::v())
                   << ") is not supported.";
      }
    }
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool use_dnn_;
  bool propagate_nans_;
};

template <typename T>
class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
 public:
  typedef GPUDevice Device;
  explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
      const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    use_dnn_ = CanUseCudnn();
    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;

    if (context->num_inputs() != 1) {
      const Tensor& tensor_ksize = context->input(1);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(2);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    PoolParameters params{context,  ksize,        stride,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape =
        ShapeFromFormat(data_format_, params.tensor_in_batch,
                        params.out_height, params.out_width, params.depth);
    if (use_dnn_ && data_format_ == FORMAT_NCHW) {
      DnnPoolingOp<T>::Compute(context,
                               perftools::gputools::dnn::PoolingMode::kMaximum,
                               ksize, stride, padding_, data_format_,
                               tensor_in, out_shape, propagate_nans_);
    } else {
      CHECK(data_format_ == FORMAT_NHWC)
          << "Non-Cudnn MaxPool only supports NHWC format";
      Tensor* output = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
      LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                                output, propagate_nans_);
    }
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool use_dnn_;
  bool propagate_nans_;
};
template <typename T>
struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output,
                     bool propagate_nans) {
    bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
        input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
        params.tensor_in_cols, params.depth, params.out_height,
        params.out_width, params.window_rows, params.window_cols,
        params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
        output->flat<T>().data(), nullptr, context->eigen_gpu_device(),
        propagate_nans);
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolForwardNoMask"));
    }
  }
};

template <typename T>
struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output, Tensor* argmax,
                     bool propagate_nans) {
    bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
        input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
        params.tensor_in_cols, params.depth, params.out_height,
        params.out_width, params.window_rows, params.window_cols,
        params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
        output->flat<T>().data(),
        reinterpret_cast<int64*>(argmax->flat<int64>().data()),
        context->eigen_gpu_device(), propagate_nans);
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
    }
  }
};

template <typename T>
struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& grad_in, const Tensor& argmax,
                     Tensor* grad_out) {
    const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                           params.tensor_in_cols * params.depth;
    const int output_size = params.tensor_in_batch * params.out_height *
                            params.out_width * params.depth;
    const int top_offset = params.out_height * params.out_width * params.depth;
    const int bottom_offset =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    bool status = functor::MaxPoolBackwardWithArgmax<T>()(
        output_size, input_size, grad_in.flat<T>().data(),
        reinterpret_cast<const int64*>(argmax.flat<int64>().data()),
        top_offset, bottom_offset, grad_out->flat<T>().data(),
        context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
    }
  }
};
template <typename T>
struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& grad_in, const Tensor& argmax,
                     Tensor* grad_out) {
    const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                           params.tensor_in_cols * params.depth;
    const int output_size = params.tensor_in_batch * params.out_height *
                            params.out_width * params.depth;
    const int top_offset =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    const int bottom_offset =
        params.out_width * params.out_height * params.depth;
    bool status = functor::MaxPoolGradBackwardWithArgmax<T>()(
        output_size, input_size, grad_in.flat<T>().data(),
        reinterpret_cast<const int64*>(argmax.flat<int64>().data()),
        top_offset, bottom_offset, grad_out->flat<T>().data(),
        context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax"));
    }
  }
};

#endif  // GOOGLE_CUDA

#define REGISTER_MAX_POOL_KERNELS(D, T)                                  \
  REGISTER_KERNEL_BUILDER(                                               \
      Name("MaxPoolGrad").Device(DEVICE_##D).TypeConstraint<T>("T"),     \
      MaxPoolingGradOp<D##Device, T>);                                   \
  REGISTER_KERNEL_BUILDER(                                               \
      Name("MaxPoolGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
      MaxPoolingGradGradOp<D##Device, T>);                               \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradV2")                          \
                              .Device(DEVICE_##D)                        \
                              .HostMemory("ksize")                       \
                              .HostMemory("strides")                     \
                              .TypeConstraint<T>("T"),                   \
                          MaxPoolingGradOp<D##Device, T>);               \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradV2")                      \
                              .Device(DEVICE_##D)                        \
                              .HostMemory("ksize")                       \
                              .HostMemory("strides")                     \
                              .TypeConstraint<T>("T"),                   \
                          MaxPoolingGradGradOp<D##Device, T>);

// Below kernels are implemented only for the CPU device.
#define REGISTER_CPU_ONLY_POOL_KERNELS(T)                          \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
      MaxPoolingOp<CPUDevice, T>);                                 \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("MaxPoolV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      MaxPoolingV2Op<CPUDevice, T>);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
#undef REGISTER_CPU_ONLY_POOL_KERNELS

#define REGISTER_CPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(CPU, T);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS);
#undef REGISTER_CPU_MAX_POOL_KERNELS

#if GOOGLE_CUDA

// Forward declarations for the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                            \
  template <>                                                          \
  void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()(             \
      const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
      typename TTypes<T, 4>::ConstTensor input, int window_rows,       \
      int window_cols, int row_stride, int col_stride,                 \
      const Eigen::PaddingType& padding);                              \
  extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;

TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
#undef DECLARE_GPU_SPEC
}  // namespace functor

#define REGISTER_GPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(GPU, T)
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
#undef REGISTER_GPU_MAX_POOL_KERNELS

// Below kernels are currently implemented only for the GPU device.
// Note(jiayq): Currently, the Caffe custom implementation is faster than the
// default Eigen implementation so we are using the custom kernel as the
// default. However, you can explicitly invoke the eigen version using
// kernel_label_map.
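// (For example, the Eigen path below is selected by registering graph nodes
// with the kernel label "eigen_tensor"; nodes without a label fall through to
// the unlabeled custom kernels registered alongside it.)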
#define REGISTER_GPU_ONLY_POOL_KERNELS(T)                            \
  REGISTER_KERNEL_BUILDER(Name("MaxPool")                            \
                              .Device(DEVICE_GPU)                    \
                              .TypeConstraint<T>("T")                \
                              .Label("eigen_tensor"),                \
                          MaxPoolingOp<GPUDevice, T>);               \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                          \
                              .Device(DEVICE_GPU)                    \
                              .HostMemory("ksize")                   \
                              .HostMemory("strides")                 \
                              .TypeConstraint<T>("T")                \
                              .Label("eigen_tensor"),                \
                          MaxPoolingV2Op<GPUDevice, T>);             \
  REGISTER_KERNEL_BUILDER(                                           \
      Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"),     \
      MaxPoolingNoMaskOp<GPUDevice, T>);                             \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                          \
                              .Device(DEVICE_GPU)                    \
                              .HostMemory("ksize")                   \
                              .HostMemory("strides")                 \
                              .TypeConstraint<T>("T"),               \
                          MaxPoolingNoMaskV2Op<GPUDevice, T>);       \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                  \
                              .Device(DEVICE_GPU)                    \
                              .TypeConstraint<int64>("Targmax")      \
                              .TypeConstraint<T>("T"),               \
                          MaxPoolingWithArgmaxOp<GPUDevice, T>);     \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")              \
                              .Device(DEVICE_GPU)                    \
                              .TypeConstraint<T>("T")                \
                              .TypeConstraint<int64>("Targmax"),     \
                          MaxPoolingGradWithArgmaxOp<GPUDevice, T>); \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")          \
                              .Device(DEVICE_GPU)                    \
                              .TypeConstraint<T>("T")                \
                              .TypeConstraint<int64>("Targmax"),     \
                          MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS);

// TODO(b/65847473): Re-enable once the underlying build error is fixed.
#if !defined(PLATFORM_WINDOWS)
REGISTER_KERNEL_BUILDER(
    Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
    MaxPoolingNoMaskOp<GPUDevice, qint8>);

REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("ksize")
                            .HostMemory("strides")
                            .TypeConstraint<qint8>("T"),
                        MaxPoolingV2Op<GPUDevice, qint8>);

REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("ksize")
                            .HostMemory("strides")
                            .TypeConstraint<qint8>("T")
                            .Label("eigen_tensor"),
                        MaxPoolingV2Op<GPUDevice, qint8>);
#endif  // !defined(PLATFORM_WINDOWS)

#undef REGISTER_GPU_ONLY_POOL_KERNELS

#endif  // GOOGLE_CUDA

#undef REGISTER_MAX_POOL_KERNELS

}  // namespace tensorflow