1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // LRN = Local Response Normalization 17 // See docs in ../ops/nn_ops.cc. 18 19 #define EIGEN_USE_THREADS 20 21 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 22 #include "tensorflow/core/framework/bounds_check.h" 23 #include "tensorflow/core/framework/op_kernel.h" 24 #include "tensorflow/core/framework/register_types.h" 25 #include "tensorflow/core/framework/tensor.h" 26 #include "tensorflow/core/kernels/ops_util.h" 27 #include "tensorflow/core/lib/core/errors.h" 28 29 #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) 30 #include "tensorflow/core/kernels/eigen_contraction_kernel.h" 31 #endif 32 33 #if !defined(IS_MOBILE_PLATFORM) 34 #include "tensorflow/core/util/work_sharder.h" 35 #endif 36 37 #if GOOGLE_CUDA 38 #include "cuda/include/cuda.h" 39 #include "tensorflow/core/platform/stream_executor.h" 40 #include "tensorflow/core/util/stream_executor_util.h" 41 #endif // GOOGLE_CUDA 42 43 namespace tensorflow { 44 45 namespace { 46 47 // When the depth is large and beta_ is 0.5 or 1.0, Single-threaded 48 // LRN is faster than the main band matrix approach used 49 // below. Benchmarks suggest switching to SingleThreadedLRN when depth > 384. 50 const int kSingleThreadedLRNDepthCutoff = 384; 51 52 // Create a depth-by-depth band matrix with 1s along a swath of size (2 * 53 // depth_radius + 1) around the diagonal. 54 template <typename T> 55 void GetBandMatrix(int depth, int depth_radius, 56 Eigen::Tensor<T, 2, Eigen::RowMajor>* result) { 57 result->setZero(); 58 for (int row = 0; row < depth; ++row) { 59 const int begin = std::max<int>(0, row - depth_radius); 60 const int end = std::min<int>(depth, row + depth_radius + 1); 61 Eigen::DSizes<Eigen::DenseIndex, 2> start(row, begin); 62 Eigen::DSizes<Eigen::DenseIndex, 2> sizes(1, end - begin); 63 result->slice(start, sizes).setConstant(T(1)); 64 } 65 } 66 67 } // namespace 68 69 typedef Eigen::ThreadPoolDevice CPUDevice; 70 typedef Eigen::GpuDevice GPUDevice; 71 72 template <typename Device, typename T> 73 struct LaunchLRN; 74 75 template <typename T> 76 struct LaunchLRN<CPUDevice, T> { 77 LaunchLRN(int depth_radius, T bias, T alpha, T beta) 78 : depth_radius_(depth_radius), bias_(bias), alpha_(alpha), beta_(beta) {} 79 80 void launch(OpKernelContext* context, OpKernel* kernel, const Tensor& in, 81 Tensor* output) { 82 const int batch = static_cast<int>(in.dim_size(0)); 83 const int rows = static_cast<int>(in.dim_size(1)); 84 const int cols = static_cast<int>(in.dim_size(2)); 85 const int depth = static_cast<int>(in.dim_size(3)); 86 87 #if defined(IS_MOBILE_PLATFORM) 88 SingleThreadedLRN(in, batch, rows, cols, depth, output); 89 #else 90 const int nodes = cols * rows; 91 if (depth > kSingleThreadedLRNDepthCutoff && 92 (beta_ == T(0.5) || beta_ == T(1))) { 93 SingleThreadedLRN(in, batch, rows, cols, depth, output); 94 return; 95 } 96 97 auto in_shaped = in.shaped<T, 2>({nodes * batch, depth}); 98 99 // Multiplying the input with the band matrix has the effect of reducing the 100 // correct patch along the depth. 101 Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth); 102 GetBandMatrix<T>(depth, depth_radius_, &multiplier); 103 104 auto out_shaped = output->shaped<T, 2>({nodes * batch, depth}); 105 Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; 106 auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_; 107 if (beta_ == T(1)) { 108 out_shaped.device(context->eigen_cpu_device()) = 109 in_shaped * tmp.inverse(); 110 } else if (beta_ == T(0.5)) { 111 out_shaped.device(context->eigen_cpu_device()) = in_shaped * tmp.rsqrt(); 112 } else { 113 out_shaped.device(context->eigen_cpu_device()) = 114 in_shaped * (tmp.log() * -beta_).exp(); 115 } 116 #endif 117 } 118 119 private: 120 typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair; 121 122 void SingleThreadedLRN(const Tensor& in, const int batch, const int rows, 123 const int cols, const int depth, Tensor* out) { 124 Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> data_in( 125 in.flat<T>().data(), depth, batch * rows * cols); 126 127 Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> data_out( 128 out->flat<T>().data(), depth, batch * rows * cols); 129 130 const int double_depth_radius = depth_radius_ * 2; 131 Eigen::Matrix<T, Eigen::Dynamic, 1> padded_square(data_in.rows() + 132 double_depth_radius); 133 padded_square.setZero(); 134 for (int r = 0; r < data_in.cols(); ++r) { 135 // Do local response normalization for data_in(:, r). First, compute the 136 // square and store them in buffer for repeated use. 137 padded_square.block(depth_radius_, 0, data_out.rows(), 1) = 138 data_in.col(r).cwiseProduct(data_in.col(r)) * alpha_; 139 // Then, compute the scale and write it to data_out. 140 T accumulated_scale(0); 141 for (int i = 0; i < double_depth_radius; ++i) { 142 accumulated_scale += padded_square(i); 143 } 144 for (int i = 0; i < data_in.rows(); ++i) { 145 accumulated_scale += padded_square(i + double_depth_radius); 146 data_out(i, r) = bias_ + accumulated_scale; 147 accumulated_scale -= padded_square(i); 148 } 149 } 150 151 if (beta_ == T(1)) { 152 data_out.array() = data_in.array() * data_out.array().inverse(); 153 } else if (beta_ == T(0.5)) { 154 data_out.array() = data_in.array() * data_out.array().rsqrt(); 155 } else { 156 data_out.array() = 157 data_in.array() * (data_out.array().log() * -beta_).exp(); 158 } 159 } 160 161 int depth_radius_; 162 T bias_; 163 T alpha_; 164 T beta_; 165 }; 166 167 #if GOOGLE_CUDA 168 169 template <typename T> 170 struct LaunchLRN<GPUDevice, T> { 171 LaunchLRN(int depth_radius, T bias, T alpha, T beta) 172 : depth_radius_(depth_radius), bias_(bias), alpha_(alpha), beta_(beta) {} 173 174 void launch(OpKernelContext* context, OpKernel* kernel, const Tensor& in, 175 Tensor* output) { 176 OP_REQUIRES( 177 context, beta_ >= 0.01, 178 errors::InvalidArgument("cuDNN requires beta >= 0.01, got: ", beta_)); 179 180 OP_REQUIRES( 181 context, depth_radius_ > 0 && depth_radius_ <= 7, 182 errors::InvalidArgument("cuDNN requires depth_radius in [1, 7], got: ", 183 depth_radius_)); 184 OP_REQUIRES( 185 context, bias_ >= 1e-5, 186 errors::InvalidArgument("cuDNN requires bias >= 1e-5, got: ", bias_)); 187 188 // Cast to platform-specific int to avoid conversion warnings. 189 const int batch = static_cast<int>(in.dim_size(0)); 190 const int rows = static_cast<int>(in.dim_size(1)); 191 const int cols = static_cast<int>(in.dim_size(2)); 192 const int depth = static_cast<int>(in.dim_size(3)); 193 194 se::dnn::BatchDescriptor dimensions_desc; 195 dimensions_desc.set_count(batch) 196 .set_height(rows) 197 .set_width(cols) 198 .set_feature_map_count(depth) 199 .set_layout(se::dnn::DataLayout::kBatchYXDepth); 200 201 se::dnn::NormalizeDescriptor normalize_desc; 202 normalize_desc.set_bias(bias_) 203 .set_range(depth_radius_) 204 .set_alpha(alpha_) 205 .set_beta(beta_); 206 207 auto input_data = StreamExecutorUtil::AsDeviceMemory<T>(in); 208 auto output_data = StreamExecutorUtil::AsDeviceMemory<T>(*output); 209 210 auto* stream = context->op_device_context()->stream(); 211 OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); 212 213 bool status = 214 stream 215 ->ThenNormalizeWithDimensions(normalize_desc, dimensions_desc, 216 input_data, &output_data) 217 .ok(); 218 OP_REQUIRES(context, status, 219 errors::Internal("NormalizeWithDimensions launch failed")); 220 } 221 222 int depth_radius_; 223 T bias_; 224 T alpha_; 225 T beta_; 226 }; 227 228 #endif // GOOGLE_CUDA 229 230 template <typename Device, typename T> 231 class LRNOp : public OpKernel { 232 public: 233 explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) { 234 int64 depth_radius64; 235 OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); 236 OP_REQUIRES( 237 context, 238 FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()), 239 errors::InvalidArgument("depth_radius = ", depth_radius64, 240 " larger than int max")); 241 depth_radius_ = static_cast<int>(depth_radius64); 242 float tmp; 243 OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp)); 244 bias_ = T(tmp); 245 OP_REQUIRES_OK(context, context->GetAttr("alpha", &tmp)); 246 alpha_ = T(tmp); 247 OP_REQUIRES_OK(context, context->GetAttr("beta", &tmp)); 248 beta_ = T(tmp); 249 } 250 251 void Compute(OpKernelContext* context) override { 252 const Tensor& in = context->input(0); 253 OP_REQUIRES(context, in.dims() == 4, 254 errors::InvalidArgument("in must be 4-dimensional")); 255 OP_REQUIRES( 256 context, 257 FastBoundsCheck(in.NumElements(), std::numeric_limits<int>::max()), 258 errors::InvalidArgument("argument to LRN too large")); 259 // Cast to platform-specific int to avoid conversion warnings. 260 const int batch = static_cast<int>(in.dim_size(0)); 261 const int rows = static_cast<int>(in.dim_size(1)); 262 const int cols = static_cast<int>(in.dim_size(2)); 263 const int depth = static_cast<int>(in.dim_size(3)); 264 265 OP_REQUIRES(context, 266 (depth + depth_radius_) <= std::numeric_limits<int>::max(), 267 errors::InvalidArgument("depth ", depth, " + depth_radius ", 268 depth_radius_, " exceeds int max.")); 269 270 Tensor* output = nullptr; 271 OP_REQUIRES_OK(context, 272 context->allocate_output( 273 0, TensorShape({batch, rows, cols, depth}), &output)); 274 275 LaunchLRN<Device, T> launcher(depth_radius_, bias_, alpha_, beta_); 276 launcher.launch(context, this, in, output); 277 } 278 279 private: 280 int depth_radius_; 281 T bias_; 282 T alpha_; 283 T beta_; 284 }; 285 286 #define REGISTER_CPU(T) \ 287 REGISTER_KERNEL_BUILDER( \ 288 Name("LRN").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ 289 LRNOp<CPUDevice, T>); 290 TF_CALL_float(REGISTER_CPU); 291 TF_CALL_half(REGISTER_CPU); 292 293 #undef REGISTER_CPU 294 295 #if GOOGLE_CUDA 296 297 #define REGISTER_GPU(T) \ 298 REGISTER_KERNEL_BUILDER( \ 299 Name("LRN").Device(DEVICE_GPU).TypeConstraint<T>("T"), \ 300 LRNOp<GPUDevice, T>); 301 TF_CALL_float(REGISTER_GPU); 302 303 #undef REGISTER_GPU 304 305 #endif // GOOGLE_CUDA 306 307 #if !defined(IS_MOBILE_PLATFORM) 308 309 template <typename Device, typename T> 310 struct LaunchLRNGrad; 311 312 template <typename T> 313 struct LaunchLRNGrad<CPUDevice, T> { 314 LaunchLRNGrad(int depth_radius, T bias, T alpha, T beta) 315 : depth_radius_(depth_radius), bias_(bias), alpha_(alpha), beta_(beta) {} 316 317 void launch(OpKernelContext* context, OpKernel* kernel, 318 const Tensor& in_grads, const Tensor& in_image, 319 const Tensor& out_image, Tensor* output) { 320 const int64 batch = in_grads.dim_size(0); 321 const int64 rows = in_grads.dim_size(1); 322 const int64 cols = in_grads.dim_size(2); 323 const int64 depth = in_grads.dim_size(3); 324 const auto nodes = cols * rows; 325 auto grads_shaped = in_grads.shaped<T, 2>({nodes * batch, depth}); 326 auto in_shaped = in_image.shaped<T, 2>({nodes * batch, depth}); 327 auto activations = out_image.shaped<T, 2>({nodes * batch, depth}); 328 329 auto out_shaped = output->shaped<T, 2>({nodes * batch, depth}); 330 out_shaped.setZero(); 331 332 auto shard = [this, activations, in_shaped, grads_shaped, out_shaped, 333 depth](int64 begin, int64 end) { 334 for (int64 i = begin; i < end; ++i) { 335 for (int64 j = 0; j < depth; ++j) { 336 // Let y be the LRN activations and x be the inputs along the depth 337 // dimension. (LRN operates independently along rows, cols, and 338 // batch). 339 // We have 340 // yi = xi / (bias + alpha(sum_j_{i - depth_radius}^{i + depth_radius} 341 // x_j^2))^beta 342 // 343 // Let N = (bias + alpha(sum_j_{i - depth_radius}^{i + depth_radius} 344 // x_j^2)) 345 // dy_i/dx_i = (N^beta - xi. beta*N^(beta-1)*2*alpha*xi)/N^(2*beta) 346 // dy_i/dx_j = ( - xi. beta*N^(beta-1)*2*alpha*xj)/N^(2*beta) 347 // 348 // NOTE(keveman) : We can compute N by doing (yi/xi) ^ (1/beta). 349 // However, this is numerically unstable for small values of xi. We 350 // compute N explicitly here to avoid that. 351 352 int64 depth_begin = std::max<int64>(0, j - depth_radius_); 353 int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1); 354 355 T norm(0); 356 for (int64 k = depth_begin; k < depth_end; ++k) { 357 norm += in_shaped(i, k) * in_shaped(i, k); 358 } 359 norm = alpha_ * norm + bias_; 360 DCHECK_GT(norm, T(1e-6)); 361 for (int64 k = depth_begin; k < depth_end; ++k) { 362 T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) * 363 activations(i, j) / norm; 364 if (k == j) { 365 dyi += Eigen::numext::pow(norm, -beta_); 366 } 367 dyi *= grads_shaped(i, j); 368 const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) += dyi; 369 } 370 } 371 } 372 }; 373 auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); 374 Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch, 375 depth * depth, shard); 376 } 377 378 int depth_radius_; 379 T bias_; 380 T alpha_; 381 T beta_; 382 }; 383 384 #if GOOGLE_CUDA 385 386 template <typename T> 387 struct LaunchLRNGrad<GPUDevice, T> { 388 LaunchLRNGrad(int depth_radius, T bias, T alpha, T beta) 389 : depth_radius_(depth_radius), bias_(bias), alpha_(alpha), beta_(beta) {} 390 391 void launch(OpKernelContext* context, OpKernel* kernel, 392 const Tensor& in_grads, const Tensor& in_image, 393 const Tensor& out_image, Tensor* output) { 394 OP_REQUIRES( 395 context, beta_ >= 0.01, 396 errors::InvalidArgument("cuDNN requires beta >= 0.01, got: ", beta_)); 397 398 OP_REQUIRES( 399 context, depth_radius_ > 0 && depth_radius_ <= 7, 400 errors::InvalidArgument("cuDNN requires depth_radius in [1, 7], got: ", 401 depth_radius_)); 402 OP_REQUIRES( 403 context, bias_ >= 1e-5, 404 errors::InvalidArgument("cuDNN requires bias >= 1e-5, got: ", bias_)); 405 406 const int64 batch = in_grads.dim_size(0); 407 const int64 rows = in_grads.dim_size(1); 408 const int64 cols = in_grads.dim_size(2); 409 const int64 depth = in_grads.dim_size(3); 410 411 se::dnn::BatchDescriptor dimensions_desc; 412 dimensions_desc.set_count(batch) 413 .set_height(rows) 414 .set_width(cols) 415 .set_feature_map_count(depth) 416 .set_layout(se::dnn::DataLayout::kBatchYXDepth); 417 418 se::dnn::NormalizeDescriptor normalize_desc; 419 normalize_desc.set_bias(bias_) 420 .set_range(depth_radius_) 421 .set_alpha(alpha_) 422 .set_beta(beta_); 423 424 auto input_grads_data = StreamExecutorUtil::AsDeviceMemory<T>(in_grads); 425 auto input_image_data = StreamExecutorUtil::AsDeviceMemory<T>(in_image); 426 auto output_image_data = StreamExecutorUtil::AsDeviceMemory<T>(out_image); 427 auto output_grads_data = StreamExecutorUtil::AsDeviceMemory<T>(*output); 428 429 auto* stream = context->op_device_context()->stream(); 430 OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); 431 432 bool status = 433 stream 434 ->ThenNormalizeBackwardWithDimensions( 435 normalize_desc, dimensions_desc, input_image_data, 436 output_image_data, input_grads_data, &output_grads_data) 437 .ok(); 438 OP_REQUIRES( 439 context, status, 440 errors::Internal("NormalizeBackwardWithDimensions launch failed")); 441 } 442 443 int depth_radius_; 444 T bias_; 445 T alpha_; 446 T beta_; 447 }; 448 449 #endif // GOOGLE_CUDA 450 451 template <typename Device, typename T> 452 class LRNGradOp : public OpKernel { 453 public: 454 explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) { 455 int64 depth_radius64; 456 OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); 457 OP_REQUIRES( 458 context, 459 FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()), 460 errors::InvalidArgument("depth_radius = ", depth_radius64, 461 " larger than int max")); 462 depth_radius_ = static_cast<int>(depth_radius64); 463 float tmp; 464 OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp)); 465 bias_ = T(tmp); 466 OP_REQUIRES_OK(context, context->GetAttr("alpha", &tmp)); 467 alpha_ = T(tmp); 468 OP_REQUIRES_OK(context, context->GetAttr("beta", &tmp)); 469 beta_ = T(tmp); 470 } 471 472 void Compute(OpKernelContext* context) override { 473 const Tensor& in_grads = context->input(0); 474 const Tensor& in_image = context->input(1); 475 const Tensor& out_image = context->input(2); 476 477 OP_REQUIRES(context, in_grads.dims() == 4 && in_image.dims() == 4, 478 errors::InvalidArgument("inputs must be 4-dimensional")); 479 const int64 batch = in_grads.dim_size(0); 480 const int64 rows = in_grads.dim_size(1); 481 const int64 cols = in_grads.dim_size(2); 482 const int64 depth = in_grads.dim_size(3); 483 OP_REQUIRES( 484 context, 485 in_image.dim_size(0) == batch && in_image.dim_size(1) == rows && 486 in_image.dim_size(2) == cols && in_image.dim_size(3) == depth && 487 out_image.dim_size(0) == batch && out_image.dim_size(1) == rows && 488 out_image.dim_size(2) == cols && out_image.dim_size(3) == depth, 489 errors::InvalidArgument( 490 "input_grads, input_image, and out_image should have the same " 491 "shape")); 492 493 Tensor* output = nullptr; 494 OP_REQUIRES_OK(context, 495 context->allocate_output( 496 0, TensorShape({batch, rows, cols, depth}), &output)); 497 498 LaunchLRNGrad<Device, T> launcher(depth_radius_, bias_, alpha_, beta_); 499 launcher.launch(context, this, in_grads, in_image, out_image, output); 500 } 501 502 private: 503 int depth_radius_; 504 T bias_; 505 T alpha_; 506 T beta_; 507 }; 508 509 #define REGISTER_CPU(T) \ 510 REGISTER_KERNEL_BUILDER( \ 511 Name("LRNGrad").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ 512 LRNGradOp<CPUDevice, T>); 513 TF_CALL_float(REGISTER_CPU); 514 TF_CALL_half(REGISTER_CPU); 515 516 #undef REGISTER_CPU 517 518 #if GOOGLE_CUDA 519 520 #define REGISTER_GPU(T) \ 521 REGISTER_KERNEL_BUILDER( \ 522 Name("LRNGrad").Device(DEVICE_GPU).TypeConstraint<T>("T"), \ 523 LRNGradOp<GPUDevice, T>); 524 TF_CALL_float(REGISTER_GPU); 525 526 #undef REGISTER_GPU 527 528 #endif // GOOGLE_CUDA 529 530 #endif // !defined(IS_MOBILE_PLATFORM) 531 532 } // namespace tensorflow 533