/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.

#define EIGEN_USE_THREADS

#include <memory>
#include <string>
#include <utility>

#include "tensorflow/core/kernels/spacetodepth_op.h"

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template <typename Device, typename T>
class SpaceToDepthOp : public OpKernel {
 public:
  explicit SpaceToDepthOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format_str;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
    OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
                errors::InvalidArgument("Invalid data format"));

    OP_REQUIRES_OK(context, context->GetAttr("block_size", &block_size_));
    OP_REQUIRES(context, block_size_ > 1,
                errors::InvalidArgument("Block size should be > 1, but was: ",
                                        block_size_));

    if (std::is_same<Device, CPUDevice>::value) {
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument(
              "Only NHWC data_format supported on CPU. Got ", data_format_str));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const int dims = input.dims();

    // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I (int8x4) here.
    constexpr bool is_int8x4 = std::is_same<T, qint8>::value;
    OP_REQUIRES(context, (is_int8x4 == (data_format_ == FORMAT_NCHW_VECT_C)),
                errors::InvalidArgument(
                    "qint8 should be used with data_format NCHW_VECT_C."));

    constexpr int kVect = is_int8x4 ? 4 : 1;
    constexpr int kDims = is_int8x4 ? 5 : 4;
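    // An NCHW_VECT_C tensor packs four qint8 values into a trailing
    // dimension of size 4, so it is rank 5; NHWC/NCHW tensors are rank 4.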
    OP_REQUIRES(context, kDims == dims,
                errors::InvalidArgument("Input rank should be: ", kDims,
                                        " instead of: ", dims));

    constexpr int kNumSpatialDims = 2;
    const int batch_size =
        input.dim_size(GetTensorDimIndex<kNumSpatialDims>(data_format_, 'N'));
    const int height =
        input.dim_size(GetTensorDimIndex<kNumSpatialDims>(data_format_, 'H'));
    const int width =
        input.dim_size(GetTensorDimIndex<kNumSpatialDims>(data_format_, 'W'));
    const int input_depth =
        input.dim_size(GetTensorDimIndex<kNumSpatialDims>(data_format_, 'C')) *
        kVect;

    // Both width and height must be divisible by block_size.
    OP_REQUIRES(context,
                (width % block_size_) == 0 && (height % block_size_) == 0,
                errors::InvalidArgument(
                    "Image width ", width, " and height ", height,
                    " should be divisible by block_size: ", block_size_));

    // The 'spatial' block of size block_size_ X block_size_ will be moved
    // to depth.
    const int output_depth = input_depth * block_size_ * block_size_;
    const int output_width = width / block_size_;
    const int output_height = height / block_size_;

    // Allocate output tensor.
    Tensor* outputs_tensor = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(
                       0,
                       ShapeFromFormat(data_format_, batch_size, output_height,
                                       output_width, output_depth),
                       &outputs_tensor));

    auto Tinput = input.tensor<T, kDims>();
    auto Toutput = outputs_tensor->tensor<T, kDims>();

    if (std::is_same<Device, GPUDevice>::value) {
      if (is_int8x4) {
        // NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32.
        auto Tinput_v = input.template reinterpret_last_dimension<int32, 4>();
        auto Toutput_v = outputs_tensor->reinterpret_last_dimension<int32, 4>();
        functor::SpaceToDepthOpFunctor<GPUDevice, int32, FORMAT_NCHW> functor;
        functor(context->eigen_device<GPUDevice>(), Tinput_v, block_size_,
                Toutput_v);
        return;
      } else if (data_format_ == FORMAT_NCHW) {
        functor::SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NCHW> functor;
        functor(context->eigen_device<GPUDevice>(), Tinput, block_size_,
                Toutput);
        return;
      }
    }

    // NOTE: Assumes data_format_ == FORMAT_NHWC here, since we have rejected
    // (CPU && data_format_ != FORMAT_NHWC) in the constructor.

    if (!is_int8x4) {
      functor::SpaceToDepthOpFunctor<Device, T, FORMAT_NHWC> functor;
      functor(context->eigen_device<Device>(), Tinput, block_size_, Toutput);
    }
  }

 private:
  int block_size_;
  TensorFormat data_format_;
};
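// Example (matching the SpaceToDepth op documentation): with block_size = 2,
// an NHWC input of shape [1, 2, 2, 1] holding [[[[1], [2]], [[3], [4]]]]
// becomes an output of shape [1, 1, 1, 4] holding [[[[1, 2, 3, 4]]]]: each
// 2x2 spatial block is flattened, row-major with input depth innermost, into
// the output depth dimension. This is the mapping the functor below computes.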
// Partial specialization of SpaceToDepthOpFunctor for a CPUDevice.
namespace functor {
template <typename T>
struct SpaceToDepthOpFunctor<CPUDevice, T, FORMAT_NHWC> {
  void operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  int block_size, typename TTypes<T, 4>::Tensor output) {
    const int batch_size = output.dimension(0);
    const int input_height = input.dimension(1);
    const int input_width = input.dimension(2);
    const int input_depth = input.dimension(3);

    for (int b = 0; b < batch_size; ++b) {
      for (int h = 0; h < input_height; ++h) {
        const int out_h = h / block_size;
        const int offset_h = (h % block_size);
        for (int w = 0; w < input_width; ++w) {
          const int out_w = w / block_size;
          const int offset_w = (w % block_size);
          const int offset_d = (offset_h * block_size + offset_w) * input_depth;
          for (int d = 0; d < input_depth; ++d) {
            const int out_d = d + offset_d;
            output(b, out_h, out_w, out_d) = input(b, h, w, d);
          }
        }
      }
    }
  }
};
}  // namespace functor

#define REGISTER(type)                                                   \
  REGISTER_KERNEL_BUILDER(                                               \
      Name("SpaceToDepth").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      SpaceToDepthOp<CPUDevice, type>);

TF_CALL_ALL_TYPES(REGISTER);
#undef REGISTER

#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(
    Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    SpaceToDepthOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
    SpaceToDepthOp<GPUDevice, qint8>);
#endif  // GOOGLE_CUDA

}  // end namespace tensorflow
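// A minimal sketch of driving the CPU functor directly, e.g. from a test.
// This is illustrative only: the `device` object and the literal shapes and
// block size are assumptions, not part of this kernel.
//
//   tensorflow::Tensor input(tensorflow::DT_FLOAT,
//                            tensorflow::TensorShape({1, 2, 2, 1}));
//   tensorflow::Tensor output(tensorflow::DT_FLOAT,
//                             tensorflow::TensorShape({1, 1, 1, 4}));
//   // Fill `input`, obtain an Eigen::ThreadPoolDevice as `device`, then:
//   tensorflow::functor::SpaceToDepthOpFunctor<
//       Eigen::ThreadPoolDevice, float, tensorflow::FORMAT_NHWC>()(
//       device, input.tensor<float, 4>(), /*block_size=*/2,
//       output.tensor<float, 4>());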