/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define USE_EIGEN_TENSOR
#define EIGEN_USE_THREADS

#include <array>

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/conv_3d.h"
#include "tensorflow/core/kernels/conv_ops_gpu.h"
#include "tensorflow/core/kernels/cudnn_pooling_gpu.h"

typedef Eigen::GpuDevice GPUDevice;

namespace tensorflow {

#if GOOGLE_CUDA

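// Runs a 3D pooling pass (average or max, selected by `pooling_mode`) through
// StreamExecutor/cuDNN. cuDNN expects channel-major (NCDHW) data, so NHWC
// inputs and outputs are staged in transposed temporaries around the call.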
template <typename T>
void DnnPooling3dOp<T>::Compute(
    OpKernelContext* context,
    perftools::gputools::dnn::PoolingMode pooling_mode,
    const std::array<int64, 3>& window, const std::array<int64, 3>& stride,
    const std::array<int64, 3>& padding, TensorFormat data_format,
    const Tensor& tensor_in, Tensor* output) {
  const auto in_shape = tensor_in.shape();
  const auto out_shape = output->shape();

  const int64 in_batch = GetTensorDim(tensor_in, data_format, 'N');
  const int64 in_features = GetTensorDim(tensor_in, data_format, 'C');

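  // If the data is NHWC, stage the input and the output in NCHW-ordered
  // temporaries; otherwise the tensors can be handed to cuDNN directly.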
  Tensor transformed_input;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_in.shape(),
                                                data_format),
                                &transformed_input));
    functor::NHWCToNCHW<GPUDevice, T, 5>()(context->eigen_device<GPUDevice>(),
                                           tensor_in.tensor<T, 5>(),
                                           transformed_input.tensor<T, 5>());
  } else {
    transformed_input = tensor_in;
  }
  Tensor transformed_output;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(
                       DataTypeToEnum<T>::value,
                       ShapeFromFormat(FORMAT_NCHW, out_shape, data_format),
                       &transformed_output));
  } else {
    transformed_output = *output;
  }

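  // Describe the pooling window and the input/output batches for cuDNN.
  // DimIndex::X is the innermost spatial dimension, so the tensor's spatial
  // dims are read in reverse order ('2', '1', '0').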
  perftools::gputools::dnn::PoolingDescriptor pooling_desc(3);
  pooling_desc.set_pooling_mode(pooling_mode);
  perftools::gputools::dnn::BatchDescriptor input_desc(3);
  input_desc.set_count(in_batch)
      .set_feature_map_count(in_features)
      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
  perftools::gputools::dnn::BatchDescriptor output_desc(3);
  output_desc.set_count(in_batch)
      .set_feature_map_count(in_features)
      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
  for (size_t i = 0; i < window.size(); ++i) {
    const auto dim_i = static_cast<perftools::gputools::dnn::DimIndex>(i);
    pooling_desc.set_window(dim_i, window[i]);
    pooling_desc.set_stride(dim_i, stride[i]);
    pooling_desc.set_padding(dim_i, padding[i]);
    input_desc.set_spatial_dim(dim_i,
                               GetTensorDim(tensor_in, data_format, '2' - i));
    output_desc.set_spatial_dim(dim_i,
                                GetTensorDim(out_shape, data_format, '2' - i));
  }

  auto input_data = AsDeviceMemory(transformed_input.template flat<T>().data(),
                                   transformed_input.template flat<T>().size());
  auto output_data =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

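  // Launch the pooling forward pass on the op's GPU stream.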
  bool status = stream
                    ->ThenPoolForward(pooling_desc, input_desc, input_data,
                                      output_desc, &output_data)
                    .ok();
  OP_REQUIRES(context, status,
              errors::Internal("cudnn PoolForward launch failed"));

  if (data_format == FORMAT_NHWC) {
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 5>()(
        context->eigen_device<GPUDevice>(),
        toConstTensor(transformed_output).template tensor<T, 5>(),
        output->tensor<T, 5>());
  }
}

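// Computes the gradient of 3D pooling via cuDNN. For max pooling the original
// input and output tensors must also be supplied; for average pooling they may
// be null, in which case placeholder NCHW buffers are allocated instead.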
template <typename T>
void DnnPooling3dGradOp<T>::Compute(
    OpKernelContext* context,
    perftools::gputools::dnn::PoolingMode pooling_mode,
    const std::array<int64, 3>& window, const std::array<int64, 3>& stride,
    const std::array<int64, 3>& padding,
    const std::array<int64, 3>& output_size, TensorFormat data_format,
    const Tensor& out_backprop, const TensorShape& tensor_in_shape,
    const Tensor* tensor_in, const Tensor* tensor_out, Tensor* input_backprop) {
  CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) ||
        (tensor_in && tensor_out))
      << "For MaxPoolGrad, both tensor_in and tensor_out need to be "
         "specified";

  const int64 in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
  const int64 in_features = GetTensorDim(tensor_in_shape, data_format, 'C');

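  // Stage the forward input/output and both backprop tensors in NCHW-ordered
  // buffers when the data is NHWC (or when the forward tensors were not
  // provided), so everything handed to cuDNN shares one layout.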
  Tensor transformed_input;
  TensorShape transformed_input_shape;
  if (data_format == FORMAT_NHWC || tensor_in == nullptr) {
    transformed_input_shape =
        ShapeFromFormat(FORMAT_NCHW, tensor_in_shape, data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_input_shape,
                                                   &transformed_input));
  } else {
    transformed_input = *tensor_in;
  }
  Tensor transformed_output;
  TensorShape transformed_output_shape;
  if (data_format == FORMAT_NHWC || tensor_out == nullptr) {
    transformed_output_shape =
        ShapeFromFormat(FORMAT_NCHW, out_backprop.shape(), data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_output_shape,
                                                   &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  Tensor transformed_input_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_input_shape,
                                          &transformed_input_backprop));
  } else {
    transformed_input_backprop = *input_backprop;
  }
  Tensor transformed_output_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_output_shape,
                                          &transformed_output_backprop));
  } else {
    transformed_output_backprop = out_backprop;
  }
  if (data_format == FORMAT_NHWC) {
    if (tensor_in != nullptr) {
      functor::NHWCToNCHW<GPUDevice, T, 5>()(context->eigen_device<GPUDevice>(),
                                             tensor_in->tensor<T, 5>(),
                                             transformed_input.tensor<T, 5>());
    }
    if (tensor_out != nullptr) {
      functor::NHWCToNCHW<GPUDevice, T, 5>()(context->eigen_device<GPUDevice>(),
                                             tensor_out->tensor<T, 5>(),
                                             transformed_output.tensor<T, 5>());
    }
    functor::NHWCToNCHW<GPUDevice, T, 5>()(
        context->eigen_device<GPUDevice>(), out_backprop.tensor<T, 5>(),
        transformed_output_backprop.tensor<T, 5>());
  }

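  // Describe the pooling window and the original input/output batches. As in
  // the forward pass, spatial dims are indexed in reverse order so that
  // DimIndex::X maps to the innermost dimension.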
  perftools::gputools::dnn::PoolingDescriptor pooling_desc(3);
  pooling_desc.set_pooling_mode(pooling_mode);

  perftools::gputools::dnn::BatchDescriptor orig_output_desc(3);
  orig_output_desc.set_count(in_batch)
      .set_feature_map_count(in_features)
      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);

  perftools::gputools::dnn::BatchDescriptor orig_input_desc(3);
  orig_input_desc.set_count(in_batch)
      .set_feature_map_count(in_features)
      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);

  for (size_t i = 0; i < window.size(); ++i) {
    const auto dim_i = static_cast<perftools::gputools::dnn::DimIndex>(i);
    pooling_desc.set_window(dim_i, window[i]);
    pooling_desc.set_stride(dim_i, stride[i]);
    pooling_desc.set_padding(dim_i, padding[i]);
    orig_input_desc.set_spatial_dim(
        dim_i, GetTensorDim(tensor_in_shape, data_format, '2' - i));
    orig_output_desc.set_spatial_dim(dim_i, output_size[i]);
  }

  auto orig_output_data =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());
  auto orig_input_data =
      AsDeviceMemory(transformed_input.template flat<T>().data(),
                     transformed_input.template flat<T>().size());
  auto output_backprop_data =
      AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
                     transformed_output_backprop.template flat<T>().size());
  auto input_backprop_data =
      AsDeviceMemory(transformed_input_backprop.template flat<T>().data(),
                     transformed_input_backprop.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

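  // Launch the pooling backward pass on the op's GPU stream.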
  bool status =
      stream
          ->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data,
                             orig_output_desc, orig_output_data,
                             output_backprop_data, &input_backprop_data)
          .ok();
  OP_REQUIRES(context, status,
              errors::Internal("cudnn PoolBackward launch failed"));

  if (data_format == FORMAT_NHWC) {
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 5>()(
        context->eigen_device<GPUDevice>(),
        toConstTensor(transformed_input_backprop).template tensor<T, 5>(),
        input_backprop->tensor<T, 5>());
  }
}

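// Explicitly instantiate the cuDNN pooling helpers for the supported types
// (float and Eigen::half).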
#define DEFINE_DNN_OPS(T)           \
  template class DnnPooling3dOp<T>; \
  template class DnnPooling3dGradOp<T>;
TF_CALL_float(DEFINE_DNN_OPS) TF_CALL_half(DEFINE_DNN_OPS)
#undef DEFINE_DNN_OPS

#endif  // GOOGLE_CUDA

}  // namespace tensorflow