      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #define EIGEN_USE_THREADS
     17 
     18 #include <algorithm>
     19 #include <cmath>
     20 
     21 #include "tensorflow/core/framework/numeric_op.h"
     22 #include "tensorflow/core/framework/op_kernel.h"
     23 #include "tensorflow/core/framework/register_types.h"
     24 #include "tensorflow/core/framework/tensor.h"
     25 #include "tensorflow/core/framework/tensor_shape.h"
     26 #include "tensorflow/core/framework/tensor_types.h"
     27 #include "tensorflow/core/framework/types.h"
     28 #include "tensorflow/core/kernels/bounds_check.h"
     29 #include "tensorflow/core/kernels/depthwise_conv_op.h"
     30 #include "tensorflow/core/kernels/ops_util.h"
     31 #include "tensorflow/core/lib/core/status.h"
     32 #include "tensorflow/core/platform/logging.h"
     33 #include "tensorflow/core/platform/types.h"
     34 #include "tensorflow/core/util/padding.h"
     35 #include "tensorflow/core/util/tensor_format.h"
     36 #include "tensorflow/core/util/work_sharder.h"
     37 
     38 #if GOOGLE_CUDA
     39 #include "tensorflow/core/platform/stream_executor.h"
     40 #endif  // GOOGLE_CUDA
     41 
     42 namespace tensorflow {
     43 
     44 // Gradient operations for depthwise convolution.
     45 
     46 typedef Eigen::ThreadPoolDevice CPUDevice;
     47 typedef Eigen::GpuDevice GPUDevice;
     48 
     49 // Common code between the two backward pass kernels: verifies that the
      50 // dimensions all match and extracts the padded rows and columns.
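         // Note: the macro below expects 'context', 'input_shape', 'filter_shape' and
         // the class members 'data_format_', 'stride_' and 'padding_' to be in scope;
         // it defines 'out_backprop' and a fully populated DepthwiseArgs 'args'.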
     51 #define EXTRACT_AND_VERIFY_DIMENSIONS(label)                                   \
     52   const Tensor& out_backprop = context->input(2);                              \
     53   OP_REQUIRES(                                                                 \
     54       context, input_shape.dims() == 4,                                        \
     55       errors::InvalidArgument(label, ": input must be 4-dimensional"));        \
     56   OP_REQUIRES(                                                                 \
     57       context, filter_shape.dims() == 4,                                       \
     58       errors::InvalidArgument(label, ": filter must be 4-dimensional"));       \
     59   OP_REQUIRES(                                                                 \
     60       context, out_backprop.dims() == 4,                                       \
     61       errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \
     62   const int64 batch = input_shape.dim_size(0);                                 \
     63   OP_REQUIRES(                                                                 \
     64       context, batch == out_backprop.dim_size(0),                              \
     65       errors::InvalidArgument(                                                 \
     66           label, ": input and out_backprop must have the same batch size"));   \
     67   const int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H');   \
     68   OP_REQUIRES(                                                                 \
     69       context,                                                                 \
     70       FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()),      \
     71       errors::InvalidArgument("Input rows too large"));                        \
     72   const int32 input_rows = static_cast<int32>(input_rows_raw);                 \
     73   const int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W');   \
     74   OP_REQUIRES(                                                                 \
     75       context,                                                                 \
     76       FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()),      \
     77       errors::InvalidArgument("Input cols too large"));                        \
     78   const int32 input_cols = static_cast<int32>(input_cols_raw);                 \
     79   const int64 filter_rows = filter_shape.dim_size(0);                          \
     80   const int64 filter_cols = filter_shape.dim_size(1);                          \
     81   const int64 output_rows_raw =                                                \
     82       GetTensorDim(out_backprop.shape(), data_format_, 'H');                   \
     83   OP_REQUIRES(                                                                 \
     84       context,                                                                 \
     85       FastBoundsCheck(output_rows_raw, std::numeric_limits<int32>::max()),     \
     86       errors::InvalidArgument("Output rows too large"));                       \
     87   const int32 output_rows = static_cast<int32>(output_rows_raw);               \
     88   const int64 output_cols_raw =                                                \
     89       GetTensorDim(out_backprop.shape(), data_format_, 'W');                   \
     90   OP_REQUIRES(                                                                 \
     91       context,                                                                 \
     92       FastBoundsCheck(output_cols_raw, std::numeric_limits<int32>::max()),     \
     93       errors::InvalidArgument("Output cols too large"));                       \
     94   const int32 output_cols = static_cast<int32>(output_cols_raw);               \
     95   const int64 in_depth = GetTensorDim(input_shape, data_format_, 'C');         \
     96   OP_REQUIRES(context, in_depth == filter_shape.dim_size(2),                   \
     97               errors::InvalidArgument(                                         \
     98                   label, ": input and filter must have the same in_depth"));   \
     99   const int64 depth_multiplier = filter_shape.dim_size(3);                     \
    100   const int64 out_depth_raw =                                                  \
    101       GetTensorDim(out_backprop.shape(), data_format_, 'C');                   \
    102   OP_REQUIRES(                                                                 \
    103       context,                                                                 \
    104       FastBoundsCheck(out_depth_raw, std::numeric_limits<int32>::max()),       \
    105       errors::InvalidArgument("Output depth too large"));                      \
    106   const int32 out_depth = static_cast<int32>(out_depth_raw);                   \
    107   OP_REQUIRES(                                                                 \
    108       context, (depth_multiplier * in_depth) == out_depth,                     \
    109       errors::InvalidArgument(                                                 \
    110           label, ": depth_multiplier * in_depth not equal to out_depth"));     \
    111   const auto stride = stride_;                                                 \
    112   int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;                \
    113   OP_REQUIRES_OK(context,                                                      \
    114                  GetWindowedOutputSize(input_rows, filter_rows, stride,        \
    115                                        padding_, &out_rows, &pad_rows));       \
    116   OP_REQUIRES_OK(context,                                                      \
    117                  GetWindowedOutputSize(input_cols, filter_cols, stride,        \
    118                                        padding_, &out_cols, &pad_cols));       \
    119   OP_REQUIRES(                                                                 \
    120       context, output_rows == out_rows,                                        \
    121       errors::InvalidArgument(                                                 \
    122           label, ": Number of rows of out_backprop doesn't match computed: ",  \
    123           "actual = ", output_rows, ", computed = ", out_rows));               \
    124   OP_REQUIRES(                                                                 \
    125       context, output_cols == out_cols,                                        \
    126       errors::InvalidArgument(                                                 \
    127           label, ": Number of cols of out_backprop doesn't match computed: ",  \
    128           "actual = ", output_cols, ", computed = ", out_cols));               \
    129   DepthwiseArgs args;                                                          \
    130   args.batch = batch;                                                          \
    131   args.in_rows = input_rows;                                                   \
    132   args.in_cols = input_cols;                                                   \
    133   args.in_depth = in_depth;                                                    \
    134   args.filter_rows = filter_rows;                                              \
    135   args.filter_cols = filter_cols;                                              \
    136   args.depth_multiplier = depth_multiplier;                                    \
    137   args.stride = stride;                                                        \
    138   args.pad_rows = pad_rows;                                                    \
    139   args.pad_cols = pad_cols;                                                    \
    140   args.out_rows = out_rows;                                                    \
    141   args.out_cols = out_cols;                                                    \
    142   args.out_depth = out_depth;                                                  \
    143   VLOG(2) << "DepthwiseConv2d: " << label << " Input: [" << batch << ", "      \
    144           << input_rows << ", " << input_cols << ", " << in_depth              \
    145           << "]; Filter: [" << filter_rows << ", " << filter_cols << ", "      \
    146           << in_depth << ", " << depth_multiplier << "]; stride = " << stride  \
    147           << ", pad_rows = " << pad_rows << ", pad_cols = " << pad_cols        \
    148           << ", output: [" << batch << ", " << out_rows << ", " << out_cols    \
    149           << ", " << out_depth << "]";
    150 
    151 // Copies data from local region in 'out_backprop' into 'buffer'.
    152 // The local region coordinates are calculated as the set of output points which
     153 // used the input point ('in_r', 'in_c') as input during the forward pass.
    154 // Rather than spatially reversing the filter, the input is reversed during
    155 // the copy. The copied data is padded to vector register-width boundaries so
    156 // that it is aligned for efficient traversal and vector multiply-add by the
    157 // depthwise input kernel.
    158 //
    159 // EX:
    160 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
    161 //
    162 //   'out_backprop': [batch, out_rows, out_cols, out_depth]
    163 //
    164 //     [a00, a01, a10, a11] [a20, a21, b00, b01]
    165 //     [b10, b11, b20, b21] [...]
    166 //     [e00, e01, e10, e11] [e20, e21, f00, f01]
    167 //     [f10, f11, f20, f21] [...]
    168 //
    169 //   'buffer' (register boundaries shown):
    170 //
    171 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
    172 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
    173 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
    174 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
    175 //
    176 template <typename T>
    177 static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
    178                                      const int64 padded_filter_inner_dim_size,
    179                                      const int64 in_r, const int64 in_c,
    180                                      const T* out_backprop, T* buffer) {
    181   typedef typename Eigen::internal::packet_traits<T>::type Packet;
    182   static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
    183 
    184   const int64 stride = args.stride;
    185   const int64 filter_rows = args.filter_rows;
    186   const int64 filter_cols = args.filter_cols;
    187   const int64 pad_rows = args.pad_rows;
    188   const int64 pad_cols = args.pad_cols;
    189   const int64 out_rows = args.out_rows;
    190   const int64 out_cols = args.out_cols;
    191 
    192   // Calculate the output spatial region which used point (in_r, in_c) as input.
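           // In the forward pass, output row 'out_r' covers input rows
           // [out_r * stride - pad_rows, out_r * stride - pad_rows + filter_rows - 1].
           // Solving that range for 'out_r' gives the bounds below; the '+ stride' in
           // the start index implements a ceiling division for non-negative values.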
    193   const int64 out_r_start = std::max(
    194       static_cast<int64>(0), (in_r - filter_rows + pad_rows + stride) / stride);
    195   const int64 out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
    196   const int64 out_c_start = std::max(
    197       static_cast<int64>(0), (in_c - filter_cols + pad_cols + stride) / stride);
    198   const int64 out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);
    199 
    200   // Zero-pad 'buffer' if output region is smaller than filter spatial size.
    201   const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
    202   if ((out_r_end - out_r_start + 1) < args.filter_rows ||
    203       (out_c_end - out_c_start + 1) < args.filter_cols) {
    204     memset(buffer, 0,
    205            filter_spatial_size * padded_filter_inner_dim_size * sizeof(T));
    206   }
    207 
     208   // Calculate vectorized and scalar (residual) lengths for 'out_depth'.
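           // Illustrative example (values not taken from the surrounding code): with
           // out_depth = 6 and kPacketSize = 4, vectorized_size = 4, scalar_size = 2
           // and pad_size = 2, so each copied row is padded out to 8 elements.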
    209   const int64 vectorized_size = (args.out_depth / kPacketSize) * kPacketSize;
    210   const int64 scalar_size = args.out_depth % kPacketSize;
    211   const int64 pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
    212 
    213   for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
    214     const int64 f_r = in_r + pad_rows - out_r * stride;
    215     for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
    216       const int64 f_c = in_c + pad_cols - out_c * stride;
    217       const int64 buf_base =
    218           (f_r * filter_cols + f_c) * padded_filter_inner_dim_size;
    219       // Calculate index into 'out_backprop' for coordinate (out_r, out_c).
    220       auto* out_bprop =
    221           out_backprop + (out_r * args.out_cols + out_c) * args.out_depth;
    222 
    223       // Copy vectorized portion of inner dimension into 'buffer'.
    224       for (int64 d = 0; d < vectorized_size; d += kPacketSize) {
    225         auto v = Eigen::internal::ploadu<Packet>(out_bprop + d);
    226         Eigen::internal::pstoreu<T>(buffer + buf_base + d, v);
    227       }
    228       // Copy scalar portion of out_bprop to 'buffer'
    229       for (int64 d = 0; d < scalar_size; ++d) {
    230         buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d];
    231       }
    232       // Pad to vector-register width (if needed).
    233       for (int64 d = 0; d < pad_size; ++d) {
    234         buffer[buf_base + vectorized_size + scalar_size + d] =
    235             static_cast<T>(0);
    236       }
    237     }
    238   }
    239 }
    240 
    241 // Computes the vectorized product of 'buffer' and 'filter' and stores
    242 // result in 'output' at location computed from 'in_r' and 'in_c'.
    243 // If depth_multiplier is > 1, the intermediate output is reduced along
    244 // the depth_multiplier dimension.
    245 //
    246 // EX:
    247 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
    248 //   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
    249 //
    250 //   'buffer' [rows, cols, in_depth, depth_multiplier]
    251 //
    252 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
    253 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
    254 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
    255 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
    256 //
    257 //   filter [rows, cols, in_depth, depth_multiplier]
    258 //     [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
    259 //     [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
    260 //
    261 //   First output register [in_depth, depth_multiplier]
    262 //     [q00, q01, q10, q11] = ([f00, f01, f10, f11] x [u0, v0, w0, x0]) +
    263 //                            ([e00, e01, e10, e11] x [u1, v1, w1, x1]) +
    264 //                            ([b00, b01, b10, b11] x [u2, v2, w2, x2]) +
    265 //                            ([a00, a01, a10, a11] x [u3, v3, w3, x3])
    266 //
    267 //   Reduction step along depth-multiplier dimension:
    268 //
    269 //     [q00, q01, q10, q11] [q20, q21, 0, 0] -> [r0, r1, r2, 0]
    270 //
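         // 'buffer' holds the local 'out_backprop' region written by
         // CopyOutputBackpropRegion, 'out_buffer' is scratch space for the
         // depth_multiplier reduction, and 'output' points at the 'in_backprop' image
         // for the current batch element.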
    271 
    272 template <typename T>
    273 static void ComputeBackpropInput(const DepthwiseArgs& args,
    274                                  const int64 padded_filter_inner_dim_size,
    275                                  const int64 in_r, const int64 in_c,
    276                                  const T* filter, const T* buffer,
    277                                  T* out_buffer, T* output) {
    278   typedef typename Eigen::internal::packet_traits<T>::type Packet;
    279   static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
    280 
    281   const int64 in_depth = args.in_depth;
    282   const int64 depth_multiplier = args.depth_multiplier;
    283   const int64 out_depth = args.out_depth;
    284   const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
    285 
    286   // Calculate vectorized and scalar lengths of 'out_depth'.
    287   const int64 output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
    288   const int64 output_scalar_size = out_depth % kPacketSize;
    289 
    290   // Calculate base index at which to begin writing output.
    291   const int64 base_output_index = (in_r * args.in_cols + in_c) * in_depth;
    292 
    293   // Calculate vectorized and scalar lengths for 'depth_multiplier'. This is
    294   // used to efficiently reduce output when 'depth_multiplier' > kPacketSize.
    295   const int64 dm_vectorized_size =
    296       (depth_multiplier / kPacketSize) * kPacketSize;
    297   const int64 dm_scalar_size = depth_multiplier % kPacketSize;
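           // Note: when depth_multiplier < kPacketSize (e.g. a depth multiplier of 1 or
           // 2 with 4-wide float packets), dm_vectorized_size is 0 and the reduction
           // below falls through to the scalar loop.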
    298 
    299   for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
    300     // Reset accumulator.
    301     auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
    302     for (int j = 0; j < filter_spatial_size; ++j) {
    303       // Calculate index.
    304       const int64 index = i + j * padded_filter_inner_dim_size;
    305       // Load filter.
    306       const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
    307       // Load input.
    308       const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
    309       // Vector multiply-add.
    310       vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
    311     }
    312     if (depth_multiplier == 1) {
    313       // Write directly to the output.
    314       Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
    315     } else {
    316       // Buffer output for subsequent reduction step.
    317       Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
    318     }
    319   }
    320 
    321   if (output_scalar_size > 0) {
    322     auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
    323     for (int j = 0; j < filter_spatial_size; ++j) {
    324       const int64 index =
    325           output_vectorized_size + j * padded_filter_inner_dim_size;
    326       const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
    327       const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
    328       vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
    329     }
    330     // Load accumulator into an array and loop through output.
    331     T out_buf[kPacketSize];
    332     Eigen::internal::pstoreu<T>(out_buf, vaccum);
    333     if (depth_multiplier == 1) {
    334       // Write directly to the output.
    335       for (int j = 0; j < output_scalar_size; ++j) {
    336         output[base_output_index + output_vectorized_size + j] = out_buf[j];
    337       }
    338     } else {
    339       // Buffer output for subsequent reduction step.
    340       for (int j = 0; j < output_scalar_size; ++j) {
    341         out_buffer[output_vectorized_size + j] = out_buf[j];
    342       }
    343     }
    344   }
    345 
    346   // Iterate over 'in_depth', reduce over 'depth_multiplier', write 'output'.
    347   if (depth_multiplier > 1) {
    348     for (int64 d = 0; d < in_depth; ++d) {
    349       const int64 index = d * args.depth_multiplier;
    350       T accum = static_cast<T>(0);
    351       for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
    352         const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
    353         accum += Eigen::internal::predux(v);
    354       }
     355       // Accumulate the scalar remainder of the depth_multiplier dimension.
    356       for (int64 dm = 0; dm < dm_scalar_size; ++dm) {
    357         accum += out_buffer[index + dm_vectorized_size + dm];
    358       }
    359       // Copy to output.
    360       output[base_output_index + d] = accum;
    361     }
    362   }
    363 }
    364 
    365 // Computes the depthwise conv2d backprop input of 'out_backprop' by
    366 // 'depthwise_filter' and stores the result in 'in_backprop'.
    367 template <typename T>
    368 struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
    369   typedef typename Eigen::internal::packet_traits<T>::type Packet;
    370 
    371   void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
    372                   const T* out_backprop, const T* depthwise_filter,
    373                   T* in_backprop, TensorFormat data_format) {
    374     OP_REQUIRES(
    375         ctx, data_format == FORMAT_NHWC,
    376         errors::Unimplemented(
    377             "Depthwise convolution on CPU is only supported for NHWC format"));
    378 
    379     static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
    380 
    381     // Pad 'depthwise_filter' to vector register width (if needed).
     382     const bool pad_filter = (args.out_depth % kPacketSize) != 0;
    383     Tensor padded_filter;
    384     if (pad_filter) {
    385       // Allocate space for padded filter.
    386       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
    387       const int64 padded_filter_inner_dim_size =
    388           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
    389       OP_REQUIRES_OK(
    390           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
    391                                   TensorShape({filter_spatial_size,
    392                                                padded_filter_inner_dim_size}),
    393                                   &padded_filter));
    394       // Write out padded filter.
    395       functor::DepthwiseFilterPadOp<T>()(
    396           args, depthwise_filter, padded_filter.template flat<T>().data());
    397     }
    398     const T* filter_data =
    399         pad_filter ? padded_filter.template flat<T>().data() : depthwise_filter;
    400 
    401     // Computes one shard of depthwise conv2d backprop input.
    402     auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
    403                      int64 start, int64 limit) {
    404       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
    405 
    406       const int64 input_image_size =
    407           args.in_rows * args.in_cols * args.in_depth;
    408       const int64 output_image_size =
    409           args.out_rows * args.out_cols * args.out_depth;
    410       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
    411       const int64 padded_filter_inner_dim_size =
    412           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
    413 
    414       // Allocate buffer to copy regions from 'out_backprop'.
    415       Tensor out_bprop_buffer;
    416       OP_REQUIRES_OK(
    417           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
    418                                   TensorShape({filter_spatial_size,
    419                                                padded_filter_inner_dim_size}),
    420                                   &out_bprop_buffer));
    421       T* out_bprop_buf = out_bprop_buffer.template flat<T>().data();
    422 
    423       // Allocate buffer for intermediate results.
    424       Tensor in_bprop_buffer;
    425       OP_REQUIRES_OK(
    426           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
    427                                   TensorShape({padded_filter_inner_dim_size}),
    428                                   &in_bprop_buffer));
    429       T* in_bprop_buf = in_bprop_buffer.template flat<T>().data();
    430 
    431       for (int64 b = start; b < limit; ++b) {
    432         for (int64 in_r = 0; in_r < args.in_rows; ++in_r) {
    433           for (int64 in_c = 0; in_c < args.in_cols; ++in_c) {
    434             // Populate 'out_bprop_buf' from local 'out_backprop' region.
    435             CopyOutputBackpropRegion<T>(
    436                 args, padded_filter_inner_dim_size, in_r, in_c,
    437                 out_backprop + b * output_image_size, out_bprop_buf);
    438 
    439             // Compute depthwise backprop input.
    440             ComputeBackpropInput<T>(args, padded_filter_inner_dim_size, in_r,
    441                                     in_c, filter_data, out_bprop_buf,
    442                                     in_bprop_buf,
    443                                     in_backprop + b * input_image_size);
    444           }
    445         }
    446       }
    447     };
    448 
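             // Rough per-image cost estimate used by Shard() to divide the batch across
             // the available worker threads.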
    449     const int64 shard_cost = args.in_rows * args.in_cols * args.out_depth;
    450     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
    451     Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
    452           shard_cost, shard);
    453   }
    454 };
    455 
    456 template <typename T>
    457 static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
    458                                                 const T* out_backprop,
    459                                                 const T* filter,
    460                                                 T* in_backprop) {
    461   // Naive for loop as a reference point without concerns about performance.
    462   for (int b = 0; b < args.batch; ++b) {
    463     for (int in_r = 0; in_r < args.in_rows; ++in_r) {
    464       for (int in_c = 0; in_c < args.in_cols; ++in_c) {
    465         for (int in_d = 0; in_d < args.in_depth; ++in_d) {
    466           T sum = 0;
    467           const int stride = args.stride;
    468           const int out_d_start = in_d * args.depth_multiplier;
    469           const int out_d_end = out_d_start + args.depth_multiplier;
    470 
    471           for (int out_d = out_d_start; out_d < out_d_end; ++out_d) {
    472             const int out_r_start = std::max(
    473                 0, (in_r - args.filter_rows + args.pad_rows + stride) / stride);
    474             const int out_r_end =
    475                 std::min(args.out_rows - 1, (in_r + args.pad_rows) / stride);
    476 
    477             for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
    478               const int out_c_start = std::max(
    479                   0,
    480                   (in_c - args.filter_cols + args.pad_cols + stride) / stride);
    481               const int out_c_end =
    482                   std::min(args.out_cols - 1, (in_c + args.pad_cols) / stride);
    483 
    484               for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
    485                 int f_r = in_r + args.pad_rows - out_r * stride;
    486                 int f_c = in_c + args.pad_cols - out_c * stride;
    487                 int filter_dm = out_d - out_d_start;
    488                 int out_backprop_offset =
    489                     out_d +
    490                     args.out_depth *
    491                         (out_c + args.out_cols * (out_r + args.out_rows * b));
    492                 int filter_offset =
    493                     filter_dm +
    494                     args.depth_multiplier *
    495                         (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
    496                 sum +=
    497                     out_backprop[out_backprop_offset] * filter[filter_offset];
    498               }
    499             }
    500           }
    501 
    502           int in_backprop_offset =
    503               in_d +
    504               args.in_depth * (in_c + args.in_cols * (in_r + args.in_rows * b));
    505           in_backprop[in_backprop_offset] = sum;
    506         }
    507       }
    508     }
    509   }
    510 }
    511 
    512 #if GOOGLE_CUDA
    513 
    514 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
    515                                                           Eigen::half>;
    516 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
    517 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
    518 
    519 #endif  // GOOGLE_CUDA
    520 
    521 // Kernel to compute the input backprop for depthwise convolution.
    522 template <typename Device, class T>
    523 class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
    524  public:
    525   explicit DepthwiseConv2dNativeBackpropInputOp(OpKernelConstruction* context)
    526       : OpKernel(context) {
    527     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    528     OP_REQUIRES(context, strides_.size() == 4,
    529                 errors::InvalidArgument("Sliding window strides field must "
    530                                         "specify 4 dimensions"));
    531 
    532     string data_format;
    533     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    534     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
    535                 errors::InvalidArgument("Invalid data format"));
    536 
    537     stride_ = GetTensorDim(strides_, data_format_, 'H');
    538     const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
    539     const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
    540     const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
    541 
    542     OP_REQUIRES(context, stride_ == stride_w,
    543                 errors::InvalidArgument(
    544                     "Current implementation only supports equal length "
    545                     "strides in the row and column dimensions."));
    546     OP_REQUIRES(
    547         context, (stride_n == 1 && stride_c == 1),
    548         errors::InvalidArgument("Current implementation does not yet support "
    549                                 "strides in the batch and depth dimensions."));
    550     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    551   }
    552 
    553   void Compute(OpKernelContext* context) override {
    554     const Tensor& input_sizes = context->input(0);
    555     const Tensor& filter = context->input(1);
    556     OP_REQUIRES(
    557         context, TensorShapeUtils::IsVector(input_sizes.shape()),
    558         errors::InvalidArgument(
     559             "DepthwiseConv2DBackpropInput: input_sizes input must be 1-dim, not ",
    560             input_sizes.dims()));
    561     TensorShape input_shape;
    562     const int32* in_sizes_data = input_sizes.template flat<int32>().data();
    563     for (int i = 0; i < input_sizes.NumElements(); ++i) {
    564       OP_REQUIRES(context, in_sizes_data[i] >= 0,
    565                   errors::InvalidArgument("Dimension ", i,
    566                                           " of input_sizes must be >= 0"));
    567       input_shape.AddDim(in_sizes_data[i]);
    568     }
    569     const TensorShape& filter_shape = filter.shape();
    570     EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");
    571     Tensor* in_backprop = nullptr;
    572     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
    573                                 {0}, 0, input_shape, &in_backprop));
    574     auto out_backprop_ptr = out_backprop.template flat<T>().data();
    575     auto filter_ptr = filter.template flat<T>().data();
    576     auto in_backprop_ptr = in_backprop->template flat<T>().data();
    577     // If there is nothing to compute, return.
    578     if (input_shape.num_elements() == 0) {
    579       return;
    580     }
    581     LaunchDepthwiseConvBackpropInputOp<Device, T>()(
    582         context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr,
    583         data_format_);
    584   }
    585 
    586  private:
    587   std::vector<int32> strides_;
    588   Padding padding_;
    589   TensorFormat data_format_;
    590   int64 stride_;
    591 
    592   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
    593 };
    594 
    595 #define REGISTER_CPU_KERNEL(T)                                       \
    596   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
    597                               .Device(DEVICE_CPU)                    \
    598                               .TypeConstraint<T>("T"),               \
    599                           DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>);
    600 TF_CALL_float(REGISTER_CPU_KERNEL);
    601 TF_CALL_double(REGISTER_CPU_KERNEL);
    602 #undef REGISTER_CPU_KERNEL
    603 
    604 #if GOOGLE_CUDA
    605 REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")
    606                             .Device(DEVICE_GPU)
    607                             .TypeConstraint<float>("T")
    608                             .HostMemory("input_sizes"),
    609                         DepthwiseConv2dNativeBackpropInputOp<GPUDevice, float>);
    610 
    611 REGISTER_KERNEL_BUILDER(
    612     Name("DepthwiseConv2dNativeBackpropInput")
    613         .Device(DEVICE_GPU)
    614         .TypeConstraint<double>("T")
    615         .HostMemory("input_sizes"),
    616     DepthwiseConv2dNativeBackpropInputOp<GPUDevice, double>);
    617 #endif  // GOOGLE_CUDA
    618 
    619 // Kernels to compute the gradients of the filters for depthwise convolution.
    620 
    621 // Computes filter backprop using 'out_backprop' and 'input_buffer', storing the
    622 // result in 'output_buffer' at an index computed from 'out_r' and 'out_c'.
    623 //
    624 // EX:
    625 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
    626 //   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
    627 //
    628 //   'input_buffer' [rows, cols, in_depth, depth_multiplier]
    629 //
    630 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
    631 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
    632 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
    633 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
    634 //
    635 //   'out_backprop' [out_rows, out_cols, in_depth, depth_multiplier]
    636 //
    637 //     [q00, q01, q10, q11] [q20, q21, r00, r01]
    638 //     [r10, r11, r20, r21] [s00, s01, s10, s11]
     639 //     [s20, s21, t00, t01] [t10, t11, t20, t21]
    640 //
    641 //   First output register of 'filter_backprop'
    642 //     [u0, v0, w0, x0] += ([f00, f01, f10, f11] x [q00, q01, q10, q11])
    643 //
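         // 'output_buffer' is the caller-allocated, register-padded accumulation buffer
         // for one image; contributions from every (out_r, out_c) position are added to
         // it before being reduced into 'filter_backprop'.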
    644 template <typename T>
    645 static void ComputeBackpropFilter(const DepthwiseArgs& args,
    646                                   const int64 padded_out_depth_size,
    647                                   const int64 out_r, const int64 out_c,
    648                                   const T* out_backprop, const T* input_buffer,
    649                                   T* output_buffer) {
    650   typedef typename Eigen::internal::packet_traits<T>::type Packet;
    651   static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
    652   // Calculate vectorized size of 'padded_out_depth_size'.
    653   const int64 out_depth = args.out_depth;
    654   const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
    655   const int64 output_vectorized_size =
    656       (padded_out_depth_size / kPacketSize) * kPacketSize;
    657   const int64 base_output_index = (out_r * args.out_cols + out_c) * out_depth;
     658   // Determine whether we can take the fast or the slow code path.
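           // The fast path issues unaligned vector loads directly from 'out_backprop',
           // so it is only safe when 'base_output_index' is far enough from the end of
           // the buffer that no such load can read past it (the bound below is
           // conservative).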
    659   const int64 output_image_size =
    660       args.out_rows * args.out_cols * args.out_depth;
    661   const int64 output_last_vector_index =
    662       output_image_size - (filter_spatial_size * padded_out_depth_size);
    663   const bool fast_path = base_output_index <= output_last_vector_index;
    664 
    665   if (fast_path) {
    666     // TODO(andydavis) Process multiple inputs in 'input_buffer' so we can
    667     // amortize the cost of 'output_buffer' load store in the loop below.
    668     for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
    669       // Load vector register from 'out_backprop'.
    670       const auto out_bprop_block =
    671           Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i);
    672       for (int j = 0; j < filter_spatial_size; ++j) {
    673         const int64 index = i + j * padded_out_depth_size;
    674         // Load vector register from 'input_buffer'.
    675         const auto input_block =
    676             Eigen::internal::ploadu<Packet>(input_buffer + index);
    677         // Load output block into vector register.
    678         auto out_block_data = output_buffer + index;
    679         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
    680         // Vector multiply-add.
    681         out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
    682                                                    out_block);
    683         // Store 'out_block' back to memory.
    684         Eigen::internal::pstoreu<T>(out_block_data, out_block);
    685       }
    686     }
    687   } else {
     688     // Slow path (can't do full vector reads from the non-padded 'out_backprop').
    689     for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
    690       // Calculate safe read size from 'out_backprop'.
    691       const int64 out_bprop_index = base_output_index + i;
    692       const int64 out_bprop_limit =
    693           std::min(output_image_size, out_bprop_index + kPacketSize);
    694       T out_buf[kPacketSize];
    695       memset(&out_buf, 0, kPacketSize * sizeof(T));
    696       const int64 scalar_size = out_bprop_limit - out_bprop_index;
    697       for (int64 j = 0; j < scalar_size; ++j) {
    698         out_buf[j] = out_backprop[out_bprop_index + j];
    699       }
    700       // Load vector register from 'out_buf'.
    701       const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf);
    702       for (int j = 0; j < filter_spatial_size; ++j) {
    703         const int64 index = i + j * padded_out_depth_size;
    704         // Load vector register from 'input_buffer'.
    705         const auto input_block =
    706             Eigen::internal::ploadu<Packet>(input_buffer + index);
    707         // Load output block into vector register.
    708         auto out_block_data = output_buffer + index;
    709         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
    710         // Vector multiply-add.
    711         out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
    712                                                    out_block);
    713         // Store 'out_block' back to memory.
    714         Eigen::internal::pstoreu<T>(out_block_data, out_block);
    715       }
    716     }
    717   }
    718 }
    719 
    720 template <typename Device, typename T>
    721 struct LaunchDepthwiseConvBackpropFilterOp;
    722 
    723 template <typename T>
    724 struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
    725   typedef typename Eigen::internal::packet_traits<T>::type Packet;
    726 
    727   void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
    728                   const T* out_backprop, const T* input, T* filter_backprop,
    729                   TensorFormat data_format) {
    730     OP_REQUIRES(
    731         ctx, data_format == FORMAT_NHWC,
    732         errors::Unimplemented(
    733             "Depthwise convolution on CPU is only supported for NHWC format"));
    734 
    735     static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
    736 
    737     const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
    738     const int64 padded_out_depth_size =
    739         ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
    740 
    741     // Allocate output buffers for each image in 'batch' (padded to vector
    742     // register boundaries).
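             // Giving each image its own accumulation buffer lets the shards below run
             // without synchronization; the per-image buffers are summed into
             // 'filter_backprop' after the parallel loop.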
    743     Tensor output_buffer;
    744     OP_REQUIRES_OK(
    745         ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
    746                                 TensorShape({args.batch, filter_spatial_size,
    747                                              padded_out_depth_size}),
    748                                 &output_buffer));
    749     T* output_buffer_data = output_buffer.template flat<T>().data();
    750 
    751     // Computes one shard of depthwise conv2d backprop filter.
    752     auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
    753                      int64 start, int64 limit) {
    754       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
    755       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
    756       const int64 padded_out_depth_size =
    757           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
    758 
    759       // Allocate buffer for local input regions.
    760       Tensor input_buffer;
    761       OP_REQUIRES_OK(
    762           ctx, ctx->allocate_temp(
    763                    DataTypeToEnum<T>::value,
    764                    TensorShape({filter_spatial_size, padded_out_depth_size}),
    765                    &input_buffer));
    766       T* input_buffer_data = input_buffer.template flat<T>().data();
    767 
    768       const int64 input_image_size =
    769           args.in_rows * args.in_cols * args.in_depth;
    770       const int64 output_image_size =
    771           args.out_rows * args.out_cols * args.out_depth;
    772       const int64 padded_filter_size =
    773           filter_spatial_size * padded_out_depth_size;
    774 
    775       for (int b = start; b < limit; ++b) {
    776         // Initialize 'output_buffer' for 'b'.
    777         auto* output_buffer = output_buffer_data + b * padded_filter_size;
    778         memset(output_buffer, 0, padded_filter_size * sizeof(T));
    779 
    780         for (int out_r = 0; out_r < args.out_rows; ++out_r) {
    781           for (int out_c = 0; out_c < args.out_cols; ++out_c) {
    782             // Populate 'input_buffer_data' with data from local input region.
    783             functor::DepthwiseInputCopyOp<T>()(
    784                 args, padded_out_depth_size, out_r, out_c,
    785                 input + b * input_image_size, input_buffer_data);
    786             // Compute depthwise backprop filter.
    787             ComputeBackpropFilter(args, padded_out_depth_size, out_r, out_c,
    788                                   out_backprop + b * output_image_size,
    789                                   input_buffer_data, output_buffer);
    790           }
    791         }
    792       }
    793     };
    794     const int64 shard_cost = args.out_rows * args.out_cols * args.out_depth;
    795     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
    796     Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
    797           shard_cost, shard);
    798 
    799     // Accumulate 'output_buffer' from each shard into 'output'.
     800     // Accumulate the per-image results in 'output_buffer' into 'filter_backprop'.
    801     const int64 vectorized_size = (out_depth / kPacketSize) * kPacketSize;
    802     const int64 scalar_size = out_depth - vectorized_size;
    803     const int64 padded_filter_size =
    804         filter_spatial_size * padded_out_depth_size;
    805     memset(filter_backprop, 0, filter_spatial_size * out_depth * sizeof(T));
    806 
    807     for (int64 i = 0; i < filter_spatial_size; ++i) {
    808       const int64 buffer_base = i * padded_out_depth_size;
    809       const int64 output_base = i * out_depth;
    810       // Write vectorized length of filter's inner dimension to output.
    811       for (int64 j = 0; j < vectorized_size; j += kPacketSize) {
    812         // Load data from 'filter_backprop' into vector register.
    813         auto out_block_data = filter_backprop + output_base + j;
    814         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
    815         for (int b = 0; b < args.batch; ++b) {
    816           // Load data from 'output_buffer' for 'b'.
    817           const auto* output_buffer =
    818               output_buffer_data + b * padded_filter_size;
    819           const auto v =
    820               Eigen::internal::ploadu<Packet>(output_buffer + buffer_base + j);
    821           // Add 'v' to 'out_block'.
    822           out_block = Eigen::internal::padd<Packet>(out_block, v);
    823         }
    824         // Store 'out_block' back to memory.
    825         Eigen::internal::pstoreu<T>(out_block_data, out_block);
    826       }
    827       // Write scalar length of filter's inner dimension to output.
    828       for (int64 j = 0; j < scalar_size; ++j) {
    829         for (int b = 0; b < args.batch; ++b) {
    830           const auto* output_buffer =
    831               output_buffer_data + b * padded_filter_size;
    832           filter_backprop[output_base + vectorized_size + j] +=
    833               output_buffer[buffer_base + vectorized_size + j];
    834         }
    835       }
    836     }
    837   }
    838 };
    839 
    840 template <typename T>
    841 static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
    842                                                  const T* out_backprop,
    843                                                  const T* input,
    844                                                  T* filter_backprop) {
    845   int num_filter_backprop = args.filter_rows * args.filter_cols *
    846                             args.in_depth * args.depth_multiplier;
    847   memset(filter_backprop, 0, num_filter_backprop * sizeof(T));
    848   // Naive for loop as a reference point without concerns about performance.
    849   for (int b = 0; b < args.batch; ++b) {
    850     for (int out_r = 0; out_r < args.out_rows; ++out_r) {
    851       for (int out_c = 0; out_c < args.out_cols; ++out_c) {
    852         for (int out_d = 0; out_d < args.out_depth; ++out_d) {
    853           const int in_d = out_d / args.depth_multiplier;
    854           const int dm = out_d % args.depth_multiplier;
    855           const int in_r_start = out_r * args.stride - args.pad_rows;
    856           const int in_c_start = out_c * args.stride - args.pad_cols;
    857 
    858           for (int f_r = 0; f_r < args.filter_rows; ++f_r) {
    859             for (int f_c = 0; f_c < args.filter_cols; ++f_c) {
    860               const int in_r = in_r_start + f_r;
    861               const int in_c = in_c_start + f_c;
    862 
    863               if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
    864                   in_c < args.in_cols) {
    865                 int out_backprop_offset =
    866                     out_d +
    867                     args.out_depth *
    868                         (out_c + args.out_cols * (out_r + args.out_rows * b));
    869                 int input_offset =
    870                     in_d +
    871                     args.in_depth *
    872                         (in_c + args.in_cols * (in_r + args.in_rows * b));
    873                 int filter_backprop_offset =
    874                     dm +
    875                     args.depth_multiplier *
    876                         (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
    877                 filter_backprop[filter_backprop_offset] +=
    878                     input[input_offset] * out_backprop[out_backprop_offset];
    879               }
    880             }
    881           }
    882         }
    883       }
    884     }
    885   }
    886 }
    887 
    888 #if GOOGLE_CUDA
    889 
    890 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
    891                                                            Eigen::half>;
    892 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
    893 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
    894 
    895 #endif  // GOOGLE_CUDA
    896 
    897 // Kernel to compute the filter backprop for depthwise convolution.
    898 template <typename Device, class T>
    899 class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
    900  public:
    901   explicit DepthwiseConv2dNativeBackpropFilterOp(OpKernelConstruction* context)
    902       : OpKernel(context) {
    903     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    904     OP_REQUIRES(context, strides_.size() == 4,
    905                 errors::InvalidArgument("Sliding window strides field must "
    906                                         "specify 4 dimensions"));
    907 
    908     string data_format;
    909     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    910     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
    911                 errors::InvalidArgument("Invalid data format"));
    912 
    913     stride_ = GetTensorDim(strides_, data_format_, 'H');
    914     const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
    915     const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
    916     const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
    917 
    918     OP_REQUIRES(context, stride_ == stride_w,
    919                 errors::InvalidArgument(
    920                     "Current implementation only supports equal length "
    921                     "strides in the row and column dimensions."));
    922     OP_REQUIRES(
    923         context, (stride_n == 1 && stride_c == 1),
    924         errors::InvalidArgument("Current implementation does not yet support "
    925                                 "strides in the batch and depth dimensions."));
    926     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    927   }
    928 
    929   void Compute(OpKernelContext* context) override {
    930     const Tensor& input = context->input(0);
    931     const Tensor& filter_sizes = context->input(1);
    932     OP_REQUIRES(
    933         context, TensorShapeUtils::IsVector(filter_sizes.shape()),
    934         errors::InvalidArgument(
     935             "DepthwiseConv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
    936             filter_sizes.dims()));
    937     TensorShape filter_shape;
    938     const int32* filter_sizes_data = filter_sizes.template flat<int32>().data();
    939     for (int i = 0; i < filter_sizes.NumElements(); ++i) {
    940       OP_REQUIRES(context, filter_sizes_data[i] >= 0,
    941                   errors::InvalidArgument("Dimension ", i,
    942                                           " of filter_sizes must be >= 0"));
    943       filter_shape.AddDim(filter_sizes_data[i]);
    944     }
    945     const TensorShape& input_shape = input.shape();
    946 
    947     EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropFilter");
    948     Tensor* filter_backprop = nullptr;
    949     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
    950                                 {1}, 0, filter_shape, &filter_backprop));
    951 
    952     auto out_backprop_ptr = out_backprop.template flat<T>().data();
    953     auto input_ptr = input.template flat<T>().data();
    954     auto filter_backprop_ptr = filter_backprop->template flat<T>().data();
    955     // If there is nothing to compute, return.
    956     if (filter_shape.num_elements() == 0) {
    957       return;
    958     }
    959     LaunchDepthwiseConvBackpropFilterOp<Device, T>()(
    960         context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr,
    961         data_format_);
    962   }
    963 
    964  private:
    965   std::vector<int32> strides_;
    966   Padding padding_;
    967   TensorFormat data_format_;
    968   int64 stride_;
    969 
    970   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
    971 };
    972 
    973 #define REGISTER_CPU_KERNEL(T)                    \
    974   REGISTER_KERNEL_BUILDER(                        \
    975       Name("DepthwiseConv2dNativeBackpropFilter") \
    976           .Device(DEVICE_CPU)                     \
    977           .TypeConstraint<T>("T"),                \
    978       DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>);
    979 TF_CALL_float(REGISTER_CPU_KERNEL);
    980 TF_CALL_double(REGISTER_CPU_KERNEL);
    981 #undef REGISTER_CPU_KERNEL
    982 
    983 #if GOOGLE_CUDA
    984 REGISTER_KERNEL_BUILDER(
    985     Name("DepthwiseConv2dNativeBackpropFilter")
    986         .Device(DEVICE_GPU)
    987         .TypeConstraint<float>("T")
    988         .HostMemory("filter_sizes"),
    989     DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, float>);
    990 
    991 REGISTER_KERNEL_BUILDER(
    992     Name("DepthwiseConv2dNativeBackpropFilter")
    993         .Device(DEVICE_GPU)
    994         .TypeConstraint<double>("T")
    995         .HostMemory("filter_sizes"),
    996     DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, double>);
    997 #endif  // GOOGLE_CUDA
    998 
    999 }  // namespace tensorflow
   1000