/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define EIGEN_USE_THREADS

#include <algorithm>
#include <cmath>

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/kernels/depthwise_conv_op.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

// Gradient operations for depthwise convolution.

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// Common code between the two backward pass kernels: verifies that the
// dimensions all match and extracts the padded rows and columns.
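//
// The macro is expanded inside the Compute() methods below. It expects
// 'context', 'input_shape', 'filter_shape', and the kernel members
// 'data_format_', 'stride_', and 'padding_' to be in scope, and it defines
// 'out_backprop' (input 2) plus a fully populated 'DepthwiseArgs args' for
// the launch functors. Usage sketch, mirroring the Compute() methods:
//
//   TensorShape input_shape = ...;  // e.g. built from the 'input_sizes' input
//   const TensorShape& filter_shape = filter.shape();
//   EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");
//   // 'args' and 'out_backprop' are now available.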
#define EXTRACT_AND_VERIFY_DIMENSIONS(label) \
  const Tensor& out_backprop = context->input(2); \
  OP_REQUIRES( \
      context, input_shape.dims() == 4, \
      errors::InvalidArgument(label, ": input must be 4-dimensional")); \
  OP_REQUIRES( \
      context, filter_shape.dims() == 4, \
      errors::InvalidArgument(label, ": filter must be 4-dimensional")); \
  OP_REQUIRES( \
      context, out_backprop.dims() == 4, \
      errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \
  const int64 batch = input_shape.dim_size(0); \
  OP_REQUIRES( \
      context, batch == out_backprop.dim_size(0), \
      errors::InvalidArgument( \
          label, ": input and out_backprop must have the same batch size")); \
  const int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H'); \
  OP_REQUIRES( \
      context, \
      FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()), \
      errors::InvalidArgument("Input rows too large")); \
  const int32 input_rows = static_cast<int32>(input_rows_raw); \
  const int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W'); \
  OP_REQUIRES( \
      context, \
      FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()), \
      errors::InvalidArgument("Input cols too large")); \
  const int32 input_cols = static_cast<int32>(input_cols_raw); \
  const int64 filter_rows = filter_shape.dim_size(0); \
  const int64 filter_cols = filter_shape.dim_size(1); \
  const int64 output_rows_raw = \
      GetTensorDim(out_backprop.shape(), data_format_, 'H'); \
  OP_REQUIRES( \
      context, \
      FastBoundsCheck(output_rows_raw, std::numeric_limits<int32>::max()), \
      errors::InvalidArgument("Output rows too large")); \
  const int32 output_rows = static_cast<int32>(output_rows_raw); \
  const int64 output_cols_raw = \
      GetTensorDim(out_backprop.shape(), data_format_, 'W'); \
  OP_REQUIRES( \
      context, \
      FastBoundsCheck(output_cols_raw, std::numeric_limits<int32>::max()), \
      errors::InvalidArgument("Output cols too large")); \
  const int32 output_cols = static_cast<int32>(output_cols_raw); \
  const int64 in_depth = GetTensorDim(input_shape, data_format_, 'C'); \
  OP_REQUIRES(context, in_depth == filter_shape.dim_size(2), \
              errors::InvalidArgument( \
                  label, ": input and filter must have the same in_depth")); \
  const int64 depth_multiplier = filter_shape.dim_size(3); \
  const int64 out_depth_raw = \
      GetTensorDim(out_backprop.shape(), data_format_, 'C'); \
  OP_REQUIRES( \
      context, \
      FastBoundsCheck(out_depth_raw, std::numeric_limits<int32>::max()), \
      errors::InvalidArgument("Output depth too large")); \
  const int32 out_depth = static_cast<int32>(out_depth_raw); \
  OP_REQUIRES( \
      context, (depth_multiplier * in_depth) == out_depth, \
      errors::InvalidArgument( \
          label, ": depth_multiplier * in_depth not equal to out_depth")); \
  const auto stride = stride_; \
  int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; \
  OP_REQUIRES_OK(context, \
                 GetWindowedOutputSize(input_rows, filter_rows, stride, \
                                       padding_, &out_rows, &pad_rows)); \
  OP_REQUIRES_OK(context, \
                 GetWindowedOutputSize(input_cols, filter_cols, stride, \
                                       padding_, &out_cols, &pad_cols)); \
  OP_REQUIRES( \
      context, output_rows == out_rows, \
      errors::InvalidArgument( \
          label, ": Number of rows of out_backprop doesn't match computed: ", \
          "actual = ", output_rows, ", computed = ", out_rows)); \
  OP_REQUIRES( \
      context, output_cols == out_cols, \
      errors::InvalidArgument( \
          label, ": Number of cols of out_backprop doesn't match computed: ", \
          "actual = ", output_cols, ", computed = ", out_cols)); \
  DepthwiseArgs args; \
  args.batch = batch; \
  args.in_rows = input_rows; \
  args.in_cols = input_cols; \
  args.in_depth = in_depth; \
  args.filter_rows = filter_rows; \
  args.filter_cols = filter_cols; \
  args.depth_multiplier = depth_multiplier; \
  args.stride = stride; \
  args.pad_rows = pad_rows; \
  args.pad_cols = pad_cols; \
  args.out_rows = out_rows; \
  args.out_cols = out_cols; \
  args.out_depth = out_depth; \
  VLOG(2) << "DepthwiseConv2d: " << label << " Input: [" << batch << ", " \
          << input_rows << ", " << input_cols << ", " << in_depth \
          << "]; Filter: [" << filter_rows << ", " << filter_cols << ", " \
          << in_depth << ", " << depth_multiplier << "]; stride = " << stride \
          << ", pad_rows = " << pad_rows << ", pad_cols = " << pad_cols \
          << ", output: [" << batch << ", " << out_rows << ", " << out_cols \
          << ", " << out_depth << "]";

// Copies data from local region in 'out_backprop' into 'buffer'.
// The local region coordinates are calculated as the set of output points
// which used the input point ('in_r', 'in_c') as input during the forward
// pass. Rather than spatially reversing the filter, the input is reversed
// during the copy. The copied data is padded to vector register-width
// boundaries so that it is aligned for efficient traversal and vector
// multiply-add by the depthwise input kernel.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//
//   'out_backprop': [batch, out_rows, out_cols, out_depth]
//
//     [a00, a01, a10, a11] [a20, a21, b00, b01]
//     [b10, b11, b20, b21] [...]
//     [e00, e01, e10, e11] [e20, e21, f00, f01]
//     [f10, f11, f20, f21] [...]
//
//   'buffer' (register boundaries shown):
//
//     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
//     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
//     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
//     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
//
template <typename T>
static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
                                     const int64 padded_filter_inner_dim_size,
                                     const int64 in_r, const int64 in_c,
                                     const T* out_backprop, T* buffer) {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

  const int64 stride = args.stride;
  const int64 filter_rows = args.filter_rows;
  const int64 filter_cols = args.filter_cols;
  const int64 pad_rows = args.pad_rows;
  const int64 pad_cols = args.pad_cols;
  const int64 out_rows = args.out_rows;
  const int64 out_cols = args.out_cols;

  // Calculate the output spatial region which used point (in_r, in_c) as
  // input.
  const int64 out_r_start = std::max(
      static_cast<int64>(0), (in_r - filter_rows + pad_rows + stride) / stride);
  const int64 out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
  const int64 out_c_start = std::max(
      static_cast<int64>(0), (in_c - filter_cols + pad_cols + stride) / stride);
  const int64 out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);

  // Zero-pad 'buffer' if output region is smaller than filter spatial size.
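  // (This happens near the image boundary, where some filter positions have
  // no corresponding output point; those entries are left as zero so they
  // contribute nothing to the multiply-add in ComputeBackpropInput().)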
  const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
  if ((out_r_end - out_r_start + 1) < args.filter_rows ||
      (out_c_end - out_c_start + 1) < args.filter_cols) {
    memset(buffer, 0,
           filter_spatial_size * padded_filter_inner_dim_size * sizeof(T));
  }

  // Calculate vectorized and scalar (residual) lengths for 'out_depth'.
  const int64 vectorized_size = (args.out_depth / kPacketSize) * kPacketSize;
  const int64 scalar_size = args.out_depth % kPacketSize;
  const int64 pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;

  for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
    const int64 f_r = in_r + pad_rows - out_r * stride;
    for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
      const int64 f_c = in_c + pad_cols - out_c * stride;
      const int64 buf_base =
          (f_r * filter_cols + f_c) * padded_filter_inner_dim_size;
      // Calculate index into 'out_backprop' for coordinate (out_r, out_c).
      auto* out_bprop =
          out_backprop + (out_r * args.out_cols + out_c) * args.out_depth;

      // Copy vectorized portion of inner dimension into 'buffer'.
      for (int64 d = 0; d < vectorized_size; d += kPacketSize) {
        auto v = Eigen::internal::ploadu<Packet>(out_bprop + d);
        Eigen::internal::pstoreu<T>(buffer + buf_base + d, v);
      }
      // Copy scalar portion of out_bprop to 'buffer'.
      for (int64 d = 0; d < scalar_size; ++d) {
        buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d];
      }
      // Pad to vector-register width (if needed).
      for (int64 d = 0; d < pad_size; ++d) {
        buffer[buf_base + vectorized_size + scalar_size + d] =
            static_cast<T>(0);
      }
    }
  }
}

// Computes the vectorized product of 'buffer' and 'filter' and stores
// result in 'output' at location computed from 'in_r' and 'in_c'.
// If depth_multiplier is > 1, the intermediate output is reduced along
// the depth_multiplier dimension.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
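//   ('buffer' is the out_backprop region staged by CopyOutputBackpropRegion()
//   above; 'filter' is the packet-padded layout written by
//   DepthwiseFilterPadOp in the launcher below, or the raw filter when
//   out_depth is already a multiple of the packet width.)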
//
//   'buffer' [rows, cols, in_depth, depth_multiplier]
//
//     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
//     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
//     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
//     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
//
//   filter [rows, cols, in_depth, depth_multiplier]
//     [u0, v0, w0, x0] [y0, z0, 0, 0]   [u1, v1, w1, x1] [y1, z1, 0, 0]
//     [u2, v2, w2, x2] [y2, z2, 0, 0]   [u3, v3, w3, x3] [y3, z3, 0, 0]
//
//   First output register [in_depth, depth_multiplier]
//     [q00, q01, q10, q11] = ([f00, f01, f10, f11] x [u0, v0, w0, x0]) +
//                            ([e00, e01, e10, e11] x [u1, v1, w1, x1]) +
//                            ([b00, b01, b10, b11] x [u2, v2, w2, x2]) +
//                            ([a00, a01, a10, a11] x [u3, v3, w3, x3])
//
//   Reduction step along depth-multiplier dimension:
//
//     [q00, q01, q10, q11] [q20, q21, 0, 0] -> [r0, r1, r2, 0]
//

template <typename T>
static void ComputeBackpropInput(const DepthwiseArgs& args,
                                 const int64 padded_filter_inner_dim_size,
                                 const int64 in_r, const int64 in_c,
                                 const T* filter, const T* buffer,
                                 T* out_buffer, T* output) {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

  const int64 in_depth = args.in_depth;
  const int64 depth_multiplier = args.depth_multiplier;
  const int64 out_depth = args.out_depth;
  const int64 filter_spatial_size = args.filter_rows * args.filter_cols;

  // Calculate vectorized and scalar lengths of 'out_depth'.
  const int64 output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
  const int64 output_scalar_size = out_depth % kPacketSize;

  // Calculate base index at which to begin writing output.
  const int64 base_output_index = (in_r * args.in_cols + in_c) * in_depth;

  // Calculate vectorized and scalar lengths for 'depth_multiplier'. This is
  // used to efficiently reduce output when 'depth_multiplier' > kPacketSize.
  const int64 dm_vectorized_size =
      (depth_multiplier / kPacketSize) * kPacketSize;
  const int64 dm_scalar_size = depth_multiplier % kPacketSize;

  for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
    // Reset accumulator.
    auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
    for (int j = 0; j < filter_spatial_size; ++j) {
      // Calculate index.
      const int64 index = i + j * padded_filter_inner_dim_size;
      // Load filter.
      const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
      // Load input.
      const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
      // Vector multiply-add.
      vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
    }
    if (depth_multiplier == 1) {
      // Write directly to the output.
      Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
    } else {
      // Buffer output for subsequent reduction step.
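      // ('out_buffer' holds all out_depth = in_depth * depth_multiplier
      // products; the loop at the end of this function reduces them over
      // 'depth_multiplier' into 'output'.)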
      Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
    }
  }

  if (output_scalar_size > 0) {
    auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
    for (int j = 0; j < filter_spatial_size; ++j) {
      const int64 index =
          output_vectorized_size + j * padded_filter_inner_dim_size;
      const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
      const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
      vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
    }
    // Load accumulator into an array and loop through output.
    T out_buf[kPacketSize];
    Eigen::internal::pstoreu<T>(out_buf, vaccum);
    if (depth_multiplier == 1) {
      // Write directly to the output.
      for (int j = 0; j < output_scalar_size; ++j) {
        output[base_output_index + output_vectorized_size + j] = out_buf[j];
      }
    } else {
      // Buffer output for subsequent reduction step.
      for (int j = 0; j < output_scalar_size; ++j) {
        out_buffer[output_vectorized_size + j] = out_buf[j];
      }
    }
  }

  // Iterate over 'in_depth', reduce over 'depth_multiplier', write 'output'.
  if (depth_multiplier > 1) {
    for (int64 d = 0; d < in_depth; ++d) {
      const int64 index = d * args.depth_multiplier;
      T accum = static_cast<T>(0);
      for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
        const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
        accum += Eigen::internal::predux(v);
      }
      // Copy scalar portion of replicated output.
      for (int64 dm = 0; dm < dm_scalar_size; ++dm) {
        accum += out_buffer[index + dm_vectorized_size + dm];
      }
      // Copy to output.
      output[base_output_index + d] = accum;
    }
  }
}

// Computes the depthwise conv2d backprop input of 'out_backprop' by
// 'depthwise_filter' and stores the result in 'in_backprop'.
template <typename T>
struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;

  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* depthwise_filter,
                  T* in_backprop, TensorFormat data_format) {
    OP_REQUIRES(
        ctx, data_format == FORMAT_NHWC,
        errors::Unimplemented(
            "Depthwise convolution on CPU is only supported for NHWC format"));

    static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

    // Pad 'depthwise_filter' to vector register width (if needed).
    const bool pad_filter = (args.out_depth % kPacketSize) == 0 ? false : true;
    Tensor padded_filter;
    if (pad_filter) {
      // Allocate space for padded filter.
      const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
      const int64 padded_filter_inner_dim_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({filter_spatial_size,
                                               padded_filter_inner_dim_size}),
                                  &padded_filter));
      // Write out padded filter.
      functor::DepthwiseFilterPadOp<T>()(
          args, depthwise_filter, padded_filter.template flat<T>().data());
    }
    const T* filter_data =
        pad_filter ? padded_filter.template flat<T>().data()
                   : depthwise_filter;

    // Computes one shard of depthwise conv2d backprop input.
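    // Sharding is over the batch dimension: each shard handles images
    // [start, limit) and writes disjoint per-image slices of 'in_backprop'.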
    auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
        int64 start, int64 limit) {
      static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

      const int64 input_image_size =
          args.in_rows * args.in_cols * args.in_depth;
      const int64 output_image_size =
          args.out_rows * args.out_cols * args.out_depth;
      const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
      const int64 padded_filter_inner_dim_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

      // Allocate buffer to copy regions from 'out_backprop'.
      Tensor out_bprop_buffer;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({filter_spatial_size,
                                               padded_filter_inner_dim_size}),
                                  &out_bprop_buffer));
      T* out_bprop_buf = out_bprop_buffer.template flat<T>().data();

      // Allocate buffer for intermediate results.
      Tensor in_bprop_buffer;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({padded_filter_inner_dim_size}),
                                  &in_bprop_buffer));
      T* in_bprop_buf = in_bprop_buffer.template flat<T>().data();

      for (int64 b = start; b < limit; ++b) {
        for (int64 in_r = 0; in_r < args.in_rows; ++in_r) {
          for (int64 in_c = 0; in_c < args.in_cols; ++in_c) {
            // Populate 'out_bprop_buf' from local 'out_backprop' region.
            CopyOutputBackpropRegion<T>(
                args, padded_filter_inner_dim_size, in_r, in_c,
                out_backprop + b * output_image_size, out_bprop_buf);

            // Compute depthwise backprop input.
            ComputeBackpropInput<T>(args, padded_filter_inner_dim_size, in_r,
                                    in_c, filter_data, out_bprop_buf,
                                    in_bprop_buf,
                                    in_backprop + b * input_image_size);
          }
        }
      }
    };

    const int64 shard_cost = args.in_rows * args.in_cols * args.out_depth;
    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
    Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
          shard_cost, shard);
  }
};

template <typename T>
static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
                                                const T* out_backprop,
                                                const T* filter,
                                                T* in_backprop) {
  // Naive for loop as a reference point without concerns about performance.
  for (int b = 0; b < args.batch; ++b) {
    for (int in_r = 0; in_r < args.in_rows; ++in_r) {
      for (int in_c = 0; in_c < args.in_cols; ++in_c) {
        for (int in_d = 0; in_d < args.in_depth; ++in_d) {
          T sum = 0;
          const int stride = args.stride;
          const int out_d_start = in_d * args.depth_multiplier;
          const int out_d_end = out_d_start + args.depth_multiplier;

          for (int out_d = out_d_start; out_d < out_d_end; ++out_d) {
            const int out_r_start = std::max(
                0, (in_r - args.filter_rows + args.pad_rows + stride) / stride);
            const int out_r_end =
                std::min(args.out_rows - 1, (in_r + args.pad_rows) / stride);

            for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
              const int out_c_start = std::max(
                  0,
                  (in_c - args.filter_cols + args.pad_cols + stride) / stride);
              const int out_c_end =
                  std::min(args.out_cols - 1, (in_c + args.pad_cols) / stride);

              for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
                int f_r = in_r + args.pad_rows - out_r * stride;
                int f_c = in_c + args.pad_cols - out_c * stride;
                int filter_dm = out_d - out_d_start;
                int out_backprop_offset =
                    out_d +
                    args.out_depth *
                        (out_c + args.out_cols * (out_r + args.out_rows * b));
                int filter_offset =
                    filter_dm +
                    args.depth_multiplier *
                        (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
                sum +=
                    out_backprop[out_backprop_offset] * filter[filter_offset];
              }
            }
          }

          int in_backprop_offset =
              in_d +
              args.in_depth * (in_c + args.in_cols * (in_r + args.in_rows * b));
          in_backprop[in_backprop_offset] = sum;
        }
      }
    }
  }
}

#if GOOGLE_CUDA

extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
                                                          Eigen::half>;
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA

// Kernel to compute the input backprop for depthwise convolution.
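// Inputs: 'input_sizes' (a 1-D int32 tensor describing the shape of the
// original input), 'filter', and 'out_backprop'; the single output is
// 'in_backprop', allocated with the shape given by 'input_sizes'.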
template <typename Device, class T>
class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
 public:
  explicit DepthwiseConv2dNativeBackpropInputOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));

    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));

    stride_ = GetTensorDim(strides_, data_format_, 'H');
    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
    const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
    const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');

    OP_REQUIRES(context, stride_ == stride_w,
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(
        context, (stride_n == 1 && stride_c == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input_sizes = context->input(0);
    const Tensor& filter = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(input_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
            input_sizes.dims()));
    TensorShape input_shape;
    const int32* in_sizes_data = input_sizes.template flat<int32>().data();
    for (int i = 0; i < input_sizes.NumElements(); ++i) {
      OP_REQUIRES(context, in_sizes_data[i] >= 0,
                  errors::InvalidArgument("Dimension ", i,
                                          " of input_sizes must be >= 0"));
      input_shape.AddDim(in_sizes_data[i]);
    }
    const TensorShape& filter_shape = filter.shape();
    EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");
    Tensor* in_backprop = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {0}, 0, input_shape, &in_backprop));
    auto out_backprop_ptr = out_backprop.template flat<T>().data();
    auto filter_ptr = filter.template flat<T>().data();
    auto in_backprop_ptr = in_backprop->template flat<T>().data();
    // If there is nothing to compute, return.
    if (input_shape.num_elements() == 0) {
      return;
    }
    LaunchDepthwiseConvBackpropInputOp<Device, T>()(
        context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr,
        data_format_);
  }

 private:
  std::vector<int32> strides_;
  Padding padding_;
  TensorFormat data_format_;
  int64 stride_;

  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
};

#define REGISTER_CPU_KERNEL(T) \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
                              .Device(DEVICE_CPU) \
                              .TypeConstraint<T>("T"), \
                          DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL

#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<float>("T")
                            .HostMemory("input_sizes"),
                        DepthwiseConv2dNativeBackpropInputOp<GPUDevice, float>);

REGISTER_KERNEL_BUILDER(
    Name("DepthwiseConv2dNativeBackpropInput")
        .Device(DEVICE_GPU)
        .TypeConstraint<double>("T")
        .HostMemory("input_sizes"),
    DepthwiseConv2dNativeBackpropInputOp<GPUDevice, double>);
#endif  // GOOGLE_CUDA

// Kernels to compute the gradients of the filters for depthwise convolution.

// Computes filter backprop using 'out_backprop' and 'input_buffer', storing
// the result in 'output_buffer' at an index computed from 'out_r' and 'out_c'.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
//
//   'input_buffer' [rows, cols, in_depth, depth_multiplier]
//
//     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
//     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
//     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
//     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
//
//   'out_backprop' [out_rows, out_cols, in_depth, depth_multiplier]
//
//     [q00, q01, q10, q11] [q20, q21, r00, r01]
//     [r10, r11, r20, r21] [s00, s01, s10, s11]
//     [s20, s21, t00, t01] [t10, t11, t20, t21]
//
//   First output register of 'filter_backprop'
//     [u0, v0, w0, x0] += ([f00, f01, f10, f11] x [q00, q01, q10, q11])
//
template <typename T>
static void ComputeBackpropFilter(const DepthwiseArgs& args,
                                  const int64 padded_out_depth_size,
                                  const int64 out_r, const int64 out_c,
                                  const T* out_backprop, const T* input_buffer,
                                  T* output_buffer) {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
  // Calculate vectorized size of 'padded_out_depth_size'.
  const int64 out_depth = args.out_depth;
  const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
  const int64 output_vectorized_size =
      (padded_out_depth_size / kPacketSize) * kPacketSize;
  const int64 base_output_index = (out_r * args.out_cols + out_c) * out_depth;
  // Determine whether we can execute fast or slow code path.
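  // The fast path requires that full packet-width loads starting at
  // 'base_output_index' stay inside 'out_backprop' for the whole padded
  // filter region; near the end of the output image the slow path instead
  // stages each packet through a zero-filled scalar buffer.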
  const int64 output_image_size =
      args.out_rows * args.out_cols * args.out_depth;
  const int64 output_last_vector_index =
      output_image_size - (filter_spatial_size * padded_out_depth_size);
  const bool fast_path = base_output_index <= output_last_vector_index;

  if (fast_path) {
    // TODO(andydavis) Process multiple inputs in 'input_buffer' so we can
    // amortize the cost of 'output_buffer' load store in the loop below.
    for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
      // Load vector register from 'out_backprop'.
      const auto out_bprop_block =
          Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i);
      for (int j = 0; j < filter_spatial_size; ++j) {
        const int64 index = i + j * padded_out_depth_size;
        // Load vector register from 'input_buffer'.
        const auto input_block =
            Eigen::internal::ploadu<Packet>(input_buffer + index);
        // Load output block into vector register.
        auto out_block_data = output_buffer + index;
        auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
        // Vector multiply-add.
        out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
                                                   out_block);
        // Store 'out_block' back to memory.
        Eigen::internal::pstoreu<T>(out_block_data, out_block);
      }
    }
  } else {
    // Slow path (can't do vector reads from non-padded 'out_backprop').
    for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
      // Calculate safe read size from 'out_backprop'.
      const int64 out_bprop_index = base_output_index + i;
      const int64 out_bprop_limit =
          std::min(output_image_size, out_bprop_index + kPacketSize);
      T out_buf[kPacketSize];
      memset(&out_buf, 0, kPacketSize * sizeof(T));
      const int64 scalar_size = out_bprop_limit - out_bprop_index;
      for (int64 j = 0; j < scalar_size; ++j) {
        out_buf[j] = out_backprop[out_bprop_index + j];
      }
      // Load vector register from 'out_buf'.
      const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf);
      for (int j = 0; j < filter_spatial_size; ++j) {
        const int64 index = i + j * padded_out_depth_size;
        // Load vector register from 'input_buffer'.
        const auto input_block =
            Eigen::internal::ploadu<Packet>(input_buffer + index);
        // Load output block into vector register.
        auto out_block_data = output_buffer + index;
        auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
        // Vector multiply-add.
        out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
                                                   out_block);
        // Store 'out_block' back to memory.
        Eigen::internal::pstoreu<T>(out_block_data, out_block);
      }
    }
  }
}

template <typename Device, typename T>
struct LaunchDepthwiseConvBackpropFilterOp;

template <typename T>
struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;

  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* input, T* filter_backprop,
                  TensorFormat data_format) {
    OP_REQUIRES(
        ctx, data_format == FORMAT_NHWC,
        errors::Unimplemented(
            "Depthwise convolution on CPU is only supported for NHWC format"));

    static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

    const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
    const int64 padded_out_depth_size =
        ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

    // Allocate output buffers for each image in 'batch' (padded to vector
    // register boundaries).
    Tensor output_buffer;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                TensorShape({args.batch, filter_spatial_size,
                                             padded_out_depth_size}),
                                &output_buffer));
    T* output_buffer_data = output_buffer.template flat<T>().data();

    // Computes one shard of depthwise conv2d backprop filter.
    auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
        int64 start, int64 limit) {
      static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
      const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
      const int64 padded_out_depth_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

      // Allocate buffer for local input regions.
      Tensor input_buffer;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(
                   DataTypeToEnum<T>::value,
                   TensorShape({filter_spatial_size, padded_out_depth_size}),
                   &input_buffer));
      T* input_buffer_data = input_buffer.template flat<T>().data();

      const int64 input_image_size =
          args.in_rows * args.in_cols * args.in_depth;
      const int64 output_image_size =
          args.out_rows * args.out_cols * args.out_depth;
      const int64 padded_filter_size =
          filter_spatial_size * padded_out_depth_size;

      for (int b = start; b < limit; ++b) {
        // Initialize 'output_buffer' for 'b'.
        auto* output_buffer = output_buffer_data + b * padded_filter_size;
        memset(output_buffer, 0, padded_filter_size * sizeof(T));

        for (int out_r = 0; out_r < args.out_rows; ++out_r) {
          for (int out_c = 0; out_c < args.out_cols; ++out_c) {
            // Populate 'input_buffer_data' with data from local input region.
            functor::DepthwiseInputCopyOp<T>()(
                args, padded_out_depth_size, out_r, out_c,
                input + b * input_image_size, input_buffer_data);
            // Compute depthwise backprop filter.
            ComputeBackpropFilter(args, padded_out_depth_size, out_r, out_c,
                                  out_backprop + b * output_image_size,
                                  input_buffer_data, output_buffer);
          }
        }
      }
    };
    const int64 shard_cost = args.out_rows * args.out_cols * args.out_depth;
    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
    Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
          shard_cost, shard);

    // Accumulate 'output_buffer' from each shard into 'filter_backprop'.
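    // Each batch image wrote its partial filter gradient into its own padded
    // slice of 'output_buffer'; the loops below sum those per-image slices
    // into 'filter_backprop', handling the packet-width portion and the
    // scalar tail separately.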
    const int64 out_depth = args.out_depth;
    const int64 vectorized_size = (out_depth / kPacketSize) * kPacketSize;
    const int64 scalar_size = out_depth - vectorized_size;
    const int64 padded_filter_size =
        filter_spatial_size * padded_out_depth_size;
    memset(filter_backprop, 0, filter_spatial_size * out_depth * sizeof(T));

    for (int64 i = 0; i < filter_spatial_size; ++i) {
      const int64 buffer_base = i * padded_out_depth_size;
      const int64 output_base = i * out_depth;
      // Write vectorized length of filter's inner dimension to output.
      for (int64 j = 0; j < vectorized_size; j += kPacketSize) {
        // Load data from 'filter_backprop' into vector register.
        auto out_block_data = filter_backprop + output_base + j;
        auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
        for (int b = 0; b < args.batch; ++b) {
          // Load data from 'output_buffer' for 'b'.
          const auto* output_buffer =
              output_buffer_data + b * padded_filter_size;
          const auto v =
              Eigen::internal::ploadu<Packet>(output_buffer + buffer_base + j);
          // Add 'v' to 'out_block'.
          out_block = Eigen::internal::padd<Packet>(out_block, v);
        }
        // Store 'out_block' back to memory.
        Eigen::internal::pstoreu<T>(out_block_data, out_block);
      }
      // Write scalar length of filter's inner dimension to output.
      for (int64 j = 0; j < scalar_size; ++j) {
        for (int b = 0; b < args.batch; ++b) {
          const auto* output_buffer =
              output_buffer_data + b * padded_filter_size;
          filter_backprop[output_base + vectorized_size + j] +=
              output_buffer[buffer_base + vectorized_size + j];
        }
      }
    }
  }
};

template <typename T>
static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
                                                 const T* out_backprop,
                                                 const T* input,
                                                 T* filter_backprop) {
  int num_filter_backprop = args.filter_rows * args.filter_cols *
                            args.in_depth * args.depth_multiplier;
  memset(filter_backprop, 0, num_filter_backprop * sizeof(T));
  // Naive for loop as a reference point without concerns about performance.
  for (int b = 0; b < args.batch; ++b) {
    for (int out_r = 0; out_r < args.out_rows; ++out_r) {
      for (int out_c = 0; out_c < args.out_cols; ++out_c) {
        for (int out_d = 0; out_d < args.out_depth; ++out_d) {
          const int in_d = out_d / args.depth_multiplier;
          const int dm = out_d % args.depth_multiplier;
          const int in_r_start = out_r * args.stride - args.pad_rows;
          const int in_c_start = out_c * args.stride - args.pad_cols;

          for (int f_r = 0; f_r < args.filter_rows; ++f_r) {
            for (int f_c = 0; f_c < args.filter_cols; ++f_c) {
              const int in_r = in_r_start + f_r;
              const int in_c = in_c_start + f_c;

              if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
                  in_c < args.in_cols) {
                int out_backprop_offset =
                    out_d +
                    args.out_depth *
                        (out_c + args.out_cols * (out_r + args.out_rows * b));
                int input_offset =
                    in_d +
                    args.in_depth *
                        (in_c + args.in_cols * (in_r + args.in_rows * b));
                int filter_backprop_offset =
                    dm +
                    args.depth_multiplier *
                        (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
                filter_backprop[filter_backprop_offset] +=
                    input[input_offset] * out_backprop[out_backprop_offset];
              }
            }
          }
        }
      }
    }
  }
}

#if GOOGLE_CUDA

extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
                                                           Eigen::half>;
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA

// Kernel to compute the filter backprop for depthwise convolution.
template <typename Device, class T>
class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
 public:
  explicit DepthwiseConv2dNativeBackpropFilterOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));

    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));

    stride_ = GetTensorDim(strides_, data_format_, 'H');
    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
    const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
    const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');

    OP_REQUIRES(context, stride_ == stride_w,
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(
        context, (stride_n == 1 && stride_c == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& filter_sizes = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(filter_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
            filter_sizes.dims()));
    TensorShape filter_shape;
    const int32* filter_sizes_data = filter_sizes.template flat<int32>().data();
    for (int i = 0; i < filter_sizes.NumElements(); ++i) {
      OP_REQUIRES(context, filter_sizes_data[i] >= 0,
                  errors::InvalidArgument("Dimension ", i,
                                          " of filter_sizes must be >= 0"));
      filter_shape.AddDim(filter_sizes_data[i]);
    }
    const TensorShape& input_shape = input.shape();

    EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropFilter");
    Tensor* filter_backprop = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {1}, 0, filter_shape, &filter_backprop));

    auto out_backprop_ptr = out_backprop.template flat<T>().data();
    auto input_ptr = input.template flat<T>().data();
    auto filter_backprop_ptr = filter_backprop->template flat<T>().data();
    // If there is nothing to compute, return.
    if (filter_shape.num_elements() == 0) {
      return;
    }
    LaunchDepthwiseConvBackpropFilterOp<Device, T>()(
        context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr,
        data_format_);
  }

 private:
  std::vector<int32> strides_;
  Padding padding_;
  TensorFormat data_format_;
  int64 stride_;

  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
};

#define REGISTER_CPU_KERNEL(T) \
  REGISTER_KERNEL_BUILDER( \
      Name("DepthwiseConv2dNativeBackpropFilter") \
          .Device(DEVICE_CPU) \
          .TypeConstraint<T>("T"), \
      DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL

#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(
    Name("DepthwiseConv2dNativeBackpropFilter")
        .Device(DEVICE_GPU)
        .TypeConstraint<float>("T")
        .HostMemory("filter_sizes"),
    DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, float>);

REGISTER_KERNEL_BUILDER(
    Name("DepthwiseConv2dNativeBackpropFilter")
        .Device(DEVICE_GPU)
        .TypeConstraint<double>("T")
        .HostMemory("filter_sizes"),
    DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, double>);
#endif  // GOOGLE_CUDA

}  // namespace tensorflow