/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.
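//
// For example, with block_size = 2, SpaceToDepth rearranges an NHWC tensor of
// shape [1, 2, 2, 1] with values [[[[1], [2]], [[3], [4]]]] into a tensor of
// shape [1, 1, 1, 4] with values [[[[1, 2, 3, 4]]]].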

#define EIGEN_USE_THREADS

#include <memory>
#include <string>
#include <utility>

#include "tensorflow/core/kernels/spacetodepth_op.h"

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template <typename Device, typename T>
class SpaceToDepthOp : public OpKernel {
 public:
  explicit SpaceToDepthOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format_str;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
    OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
                errors::InvalidArgument("Invalid data format"));

    OP_REQUIRES_OK(context, context->GetAttr("block_size", &block_size_));
    OP_REQUIRES(context, block_size_ > 1,
                errors::InvalidArgument("Block size should be > 1, but was: ",
                                        block_size_));

    if (std::is_same<Device, CPUDevice>::value) {
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument(
              "Only NHWC data_format supported on CPU. Got ", data_format_str));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const int dims = input.dims();

    // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I (int8x4) here.
    constexpr bool is_int8x4 = std::is_same<T, qint8>::value;
    OP_REQUIRES(context, (is_int8x4 == (data_format_ == FORMAT_NCHW_VECT_C)),
                errors::InvalidArgument(
                    "qint8 should be used with data_format NCHW_VECT_C."));

    constexpr int kVect = is_int8x4 ? 4 : 1;
    constexpr int kDims = is_int8x4 ? 5 : 4;
    OP_REQUIRES(context, kDims == dims,
                errors::InvalidArgument("Input rank should be: ", kDims,
                                        " instead of: ", dims));

    constexpr int kNumSpatialDims = 2;
    const int batch_size =
        input.dim_size(GetTensorDimIndex<kNumSpatialDims>(data_format_, 'N'));
    const int height =
        input.dim_size(GetTensorDimIndex<kNumSpatialDims>(data_format_, 'H'));
    const int width =
        input.dim_size(GetTensorDimIndex<kNumSpatialDims>(data_format_, 'W'));
    const int input_depth =
        input.dim_size(GetTensorDimIndex<kNumSpatialDims>(data_format_, 'C')) *
        kVect;

    // Both width and height must be divisible by block_size.
    OP_REQUIRES(context,
                (width % block_size_) == 0 && (height % block_size_) == 0,
                errors::InvalidArgument(
                    "Image width ", width, " and height ", height,
                    " should be divisible by block_size: ", block_size_));

    // The 'spatial' block of size block_size_ X block_size_ will be moved
    // to depth.
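    // For example, with block_size_ = 2 an NHWC input of shape [B, H, W, C]
    // produces an output of shape [B, H / 2, W / 2, 4 * C].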
    const int output_depth = input_depth * block_size_ * block_size_;
    const int output_width = width / block_size_;
    const int output_height = height / block_size_;

    // Allocate output tensor.
    Tensor* outputs_tensor = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(
                       0,
                       ShapeFromFormat(data_format_, batch_size, output_height,
                                       output_width, output_depth),
                       &outputs_tensor));

    auto Tinput = input.tensor<T, kDims>();
    auto Toutput = outputs_tensor->tensor<T, kDims>();

    if (std::is_same<Device, GPUDevice>::value) {
      if (is_int8x4) {
        // NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32.
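        // reinterpret_last_dimension packs each trailing group of 4 qint8
        // values into a single int32 element, so the rank-5 NCHW_VECT_C
        // tensor is processed as a rank-4 NCHW tensor of int32.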
        auto Tinput_v = input.template reinterpret_last_dimension<int32, 4>();
        auto Toutput_v = outputs_tensor->reinterpret_last_dimension<int32, 4>();
        functor::SpaceToDepthOpFunctor<GPUDevice, int32, FORMAT_NCHW> functor;
        functor(context->eigen_device<GPUDevice>(), Tinput_v, block_size_,
                Toutput_v);
        return;
      } else if (data_format_ == FORMAT_NCHW) {
        functor::SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NCHW> functor;
        functor(context->eigen_device<GPUDevice>(), Tinput, block_size_,
                Toutput);
        return;
      }
    }

    // NOTE: Assumes data_format_ == FORMAT_NHWC here, since we have rejected
    // (CPU && data_format_ != FORMAT_NHWC) in the constructor.

    if (!is_int8x4) {
      functor::SpaceToDepthOpFunctor<Device, T, FORMAT_NHWC> functor;
      functor(context->eigen_device<Device>(), Tinput, block_size_, Toutput);
    }
  }

 private:
  int block_size_;
  TensorFormat data_format_;
};

// Partial specialization of SpaceToDepthOpFunctor for a CPUDevice.
namespace functor {
template <typename T>
struct SpaceToDepthOpFunctor<CPUDevice, T, FORMAT_NHWC> {
  void operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                  int block_size, typename TTypes<T, 4>::Tensor output) {
    const int batch_size = output.dimension(0);
    const int input_height = input.dimension(1);
    const int input_width = input.dimension(2);
    const int input_depth = input.dimension(3);

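    // Each block_size x block_size spatial block of the input is flattened
    // into a contiguous slice of the output depth:
    //   output(b, h / block_size, w / block_size,
    //          ((h % block_size) * block_size + w % block_size) * input_depth + d)
    //       = input(b, h, w, d)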
    for (int b = 0; b < batch_size; ++b) {
      for (int h = 0; h < input_height; ++h) {
        const int out_h = h / block_size;
        const int offset_h = (h % block_size);
        for (int w = 0; w < input_width; ++w) {
          const int out_w = w / block_size;
          const int offset_w = (w % block_size);
          const int offset_d = (offset_h * block_size + offset_w) * input_depth;
          for (int d = 0; d < input_depth; ++d) {
            const int out_d = d + offset_d;
            output(b, out_h, out_w, out_d) = input(b, h, w, d);
          }
        }
      }
    }
  }
};
}  // namespace functor

#define REGISTER(type)                                                   \
  REGISTER_KERNEL_BUILDER(                                               \
      Name("SpaceToDepth").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      SpaceToDepthOp<CPUDevice, type>);

TF_CALL_ALL_TYPES(REGISTER);
#undef REGISTER

#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(
    Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    SpaceToDepthOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
    SpaceToDepthOp<GPUDevice, qint8>);
#endif  // GOOGLE_CUDA

}  // end namespace tensorflow