1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #define USE_EIGEN_TENSOR 17 #define EIGEN_USE_THREADS 18 19 #include <array> 20 21 #include "tensorflow/core/framework/register_types.h" 22 #include "tensorflow/core/kernels/conv_2d.h" 23 #include "tensorflow/core/kernels/conv_3d.h" 24 #include "tensorflow/core/kernels/conv_ops_gpu.h" 25 #include "tensorflow/core/kernels/cudnn_pooling_gpu.h" 26 27 typedef Eigen::GpuDevice GPUDevice; 28 29 namespace tensorflow { 30 31 #if GOOGLE_CUDA 32 33 template <typename T> 34 void DnnPooling3dOp<T>::Compute( 35 OpKernelContext* context, 36 perftools::gputools::dnn::PoolingMode pooling_mode, 37 const std::array<int64, 3>& window, const std::array<int64, 3>& stride, 38 const std::array<int64, 3>& padding, TensorFormat data_format, 39 const Tensor& tensor_in, Tensor* output) { 40 const auto in_shape = tensor_in.shape(); 41 const auto out_shape = output->shape(); 42 43 const int64 in_batch = GetTensorDim(tensor_in, data_format, 'N'); 44 const int64 in_features = GetTensorDim(tensor_in, data_format, 'C'); 45 46 Tensor transformed_input; 47 if (data_format == FORMAT_NHWC) { 48 OP_REQUIRES_OK(context, context->allocate_temp( 49 DataTypeToEnum<T>::value, 50 ShapeFromFormat(FORMAT_NCHW, tensor_in.shape(), 51 data_format), 52 &transformed_input)); 53 functor::NHWCToNCHW<GPUDevice, T, 5>()(context->eigen_device<GPUDevice>(), 54 tensor_in.tensor<T, 5>(), 55 transformed_input.tensor<T, 5>()); 56 } else { 57 transformed_input = tensor_in; 58 } 59 Tensor transformed_output; 60 if (data_format == FORMAT_NHWC) { 61 OP_REQUIRES_OK(context, 62 context->allocate_temp( 63 DataTypeToEnum<T>::value, 64 ShapeFromFormat(FORMAT_NCHW, out_shape, data_format), 65 &transformed_output)); 66 } else { 67 transformed_output = *output; 68 } 69 70 perftools::gputools::dnn::PoolingDescriptor pooling_desc(3); 71 pooling_desc.set_pooling_mode(pooling_mode); 72 perftools::gputools::dnn::BatchDescriptor input_desc(3); 73 input_desc.set_count(in_batch) 74 .set_feature_map_count(in_features) 75 .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); 76 perftools::gputools::dnn::BatchDescriptor output_desc(3); 77 output_desc.set_count(in_batch) 78 .set_feature_map_count(in_features) 79 .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); 80 for (size_t i = 0; i < window.size(); ++i) { 81 const auto dim_i = static_cast<perftools::gputools::dnn::DimIndex>(i); 82 pooling_desc.set_window(dim_i, window[i]); 83 pooling_desc.set_stride(dim_i, stride[i]); 84 pooling_desc.set_padding(dim_i, padding[i]); 85 input_desc.set_spatial_dim(dim_i, 86 GetTensorDim(tensor_in, data_format, '2' - i)); 87 output_desc.set_spatial_dim(dim_i, 88 GetTensorDim(out_shape, data_format, '2' - i)); 89 } 90 91 auto input_data = AsDeviceMemory(transformed_input.template flat<T>().data(), 92 transformed_input.template flat<T>().size()); 93 auto output_data = 94 AsDeviceMemory(transformed_output.template flat<T>().data(), 95 transformed_output.template flat<T>().size()); 96 97 auto* stream = context->op_device_context()->stream(); 98 OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); 99 100 bool status = stream 101 ->ThenPoolForward(pooling_desc, input_desc, input_data, 102 output_desc, &output_data) 103 .ok(); 104 OP_REQUIRES(context, status, 105 errors::Internal("cudnn PoolForward launch failed")); 106 107 if (data_format == FORMAT_NHWC) { 108 auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; 109 functor::NCHWToNHWC<GPUDevice, T, 5>()( 110 context->eigen_device<GPUDevice>(), 111 toConstTensor(transformed_output).template tensor<T, 5>(), 112 output->tensor<T, 5>()); 113 } 114 } 115 116 template <typename T> 117 void DnnPooling3dGradOp<T>::Compute( 118 OpKernelContext* context, 119 perftools::gputools::dnn::PoolingMode pooling_mode, 120 const std::array<int64, 3>& window, const std::array<int64, 3>& stride, 121 const std::array<int64, 3>& padding, 122 const std::array<int64, 3>& output_size, TensorFormat data_format, 123 const Tensor& out_backprop, const TensorShape& tensor_in_shape, 124 const Tensor* tensor_in, const Tensor* tensor_out, Tensor* input_backprop) { 125 CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) || 126 (tensor_in && tensor_out)) 127 << "For MaxPoolGrad, both tensor_in and tensor_out needs to be " 128 "specified"; 129 130 const int64 in_batch = GetTensorDim(tensor_in_shape, data_format, 'N'); 131 const int64 in_features = GetTensorDim(tensor_in_shape, data_format, 'C'); 132 133 Tensor transformed_input; 134 TensorShape transformed_input_shape; 135 if (data_format == FORMAT_NHWC || tensor_in == nullptr) { 136 transformed_input_shape = 137 ShapeFromFormat(FORMAT_NCHW, tensor_in_shape, data_format); 138 OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value, 139 transformed_input_shape, 140 &transformed_input)); 141 } else { 142 transformed_input = *tensor_in; 143 } 144 Tensor transformed_output; 145 TensorShape transformed_output_shape; 146 if (data_format == FORMAT_NHWC || tensor_out == nullptr) { 147 transformed_output_shape = 148 ShapeFromFormat(FORMAT_NCHW, out_backprop.shape(), data_format); 149 OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value, 150 transformed_output_shape, 151 &transformed_output)); 152 } else { 153 transformed_output = *tensor_out; 154 } 155 Tensor transformed_input_backprop; 156 if (data_format == FORMAT_NHWC) { 157 OP_REQUIRES_OK(context, 158 context->allocate_temp(DataTypeToEnum<T>::value, 159 transformed_input_shape, 160 &transformed_input_backprop)); 161 } else { 162 transformed_input_backprop = *input_backprop; 163 } 164 Tensor transformed_output_backprop; 165 if (data_format == FORMAT_NHWC) { 166 OP_REQUIRES_OK(context, 167 context->allocate_temp(DataTypeToEnum<T>::value, 168 transformed_output_shape, 169 &transformed_output_backprop)); 170 } else { 171 transformed_output_backprop = out_backprop; 172 } 173 if (data_format == FORMAT_NHWC) { 174 if (tensor_in != nullptr) { 175 functor::NHWCToNCHW<GPUDevice, T, 5>()(context->eigen_device<GPUDevice>(), 176 tensor_in->tensor<T, 5>(), 177 transformed_input.tensor<T, 5>()); 178 } 179 if (tensor_out != nullptr) { 180 functor::NHWCToNCHW<GPUDevice, T, 5>()(context->eigen_device<GPUDevice>(), 181 tensor_out->tensor<T, 5>(), 182 transformed_output.tensor<T, 5>()); 183 } 184 functor::NHWCToNCHW<GPUDevice, T, 5>()( 185 context->eigen_device<GPUDevice>(), out_backprop.tensor<T, 5>(), 186 transformed_output_backprop.tensor<T, 5>()); 187 } 188 189 perftools::gputools::dnn::PoolingDescriptor pooling_desc(3); 190 pooling_desc.set_pooling_mode(pooling_mode); 191 192 perftools::gputools::dnn::BatchDescriptor orig_output_desc(3); 193 orig_output_desc.set_count(in_batch) 194 .set_feature_map_count(in_features) 195 .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); 196 197 perftools::gputools::dnn::BatchDescriptor orig_input_desc(3); 198 orig_input_desc.set_count(in_batch) 199 .set_feature_map_count(in_features) 200 .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); 201 202 for (size_t i = 0; i < window.size(); ++i) { 203 const auto dim_i = static_cast<perftools::gputools::dnn::DimIndex>(i); 204 pooling_desc.set_window(dim_i, window[i]); 205 pooling_desc.set_stride(dim_i, stride[i]); 206 pooling_desc.set_padding(dim_i, padding[i]); 207 orig_input_desc.set_spatial_dim( 208 dim_i, GetTensorDim(tensor_in_shape, data_format, '2' - i)); 209 orig_output_desc.set_spatial_dim(dim_i, output_size[i]); 210 } 211 212 auto orig_output_data = 213 AsDeviceMemory(transformed_output.template flat<T>().data(), 214 transformed_output.template flat<T>().size()); 215 auto orig_input_data = 216 AsDeviceMemory(transformed_input.template flat<T>().data(), 217 transformed_input.template flat<T>().size()); 218 auto output_backprop_data = 219 AsDeviceMemory(transformed_output_backprop.template flat<T>().data(), 220 transformed_output_backprop.template flat<T>().size()); 221 auto input_backprop_data = 222 AsDeviceMemory(transformed_input_backprop.template flat<T>().data(), 223 transformed_input_backprop.template flat<T>().size()); 224 225 auto* stream = context->op_device_context()->stream(); 226 OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); 227 228 bool status = 229 stream 230 ->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data, 231 orig_output_desc, orig_output_data, 232 output_backprop_data, &input_backprop_data) 233 .ok(); 234 OP_REQUIRES(context, status, 235 errors::Internal("cudnn PoolBackward launch failed")); 236 237 if (data_format == FORMAT_NHWC) { 238 auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; 239 functor::NCHWToNHWC<GPUDevice, T, 5>()( 240 context->eigen_device<GPUDevice>(), 241 toConstTensor(transformed_input_backprop).template tensor<T, 5>(), 242 input_backprop->tensor<T, 5>()); 243 } 244 } 245 246 #define DEFINE_DNN_OPS(T) \ 247 template class DnnPooling3dOp<T>; \ 248 template class DnnPooling3dGradOp<T>; 249 TF_CALL_float(DEFINE_DNN_OPS) TF_CALL_half(DEFINE_DNN_OPS) 250 #undef DEFINE_DNN_OPS 251 252 #endif // GOOGLE_CUDA 253 254 } // namespace tensorflow 255