// Home | History | Annotate | Download | only in neon
      1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
#include <algorithm>
#include <cmath>
#include <cstring>
#include <type_traits>
     19 
     20 #define GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
     21 #include "public/gemmlowp.h"
     22 #include "tensorflow/core/framework/numeric_op.h"
     23 #include "tensorflow/core/framework/op_kernel.h"
     24 #include "tensorflow/core/framework/register_types.h"
     25 #include "tensorflow/core/framework/tensor.h"
     26 #include "tensorflow/core/framework/tensor_shape.h"
     27 #include "tensorflow/core/framework/tensor_types.h"
     28 #include "tensorflow/core/framework/types.h"
     29 #include "tensorflow/core/kernels/bounds_check.h"
     30 #include "tensorflow/core/kernels/neon/depthwiseconv_float.h"
     31 #include "tensorflow/core/kernels/ops_util.h"
     32 #include "tensorflow/core/lib/core/status.h"
     33 #include "tensorflow/core/platform/logging.h"
     34 #include "tensorflow/core/platform/mem.h"
     35 #include "tensorflow/core/platform/types.h"
     36 #include "tensorflow/core/util/padding.h"
     37 
     38 namespace tensorflow {
     39 
     40 // A version of tensorflow/core/kernels/depthwise_conv_op.cc that
     41 // uses the neon intrinsics.
     42 class NeonDepthwiseConv2dNativeOp : public BinaryOp<float> {
     43  public:
     44   explicit NeonDepthwiseConv2dNativeOp(OpKernelConstruction* context)
     45       : BinaryOp<float>(context) {
     46     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     47     OP_REQUIRES(context, strides_.size() == 4,
     48                 errors::InvalidArgument("Sliding window strides field must "
     49                                         "specify 4 dimensions"));
     50     OP_REQUIRES(context, strides_[1] == strides_[2],
     51                 errors::InvalidArgument(
     52                     "Current implementation only supports equal length "
     53                     "strides in the row and column dimensions."));
     54     OP_REQUIRES(
     55         context, (strides_[0] == 1 && strides_[3] == 1),
     56         errors::InvalidArgument("Current implementation does not yet support "
     57                                 "strides in the batch and depth dimensions."));
     58     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     59   }
     60 
     61   void Compute(OpKernelContext* context) override {
     62     const Tensor& input = context->input(0);
     63     const Tensor& filter = context->input(1);
     64 
     65     // For 2D convolution, there should be 4 dimensions.
     66     OP_REQUIRES(context, input.dims() == 4,
     67                 errors::InvalidArgument("input must be 4-dimensional",
     68                                         input.shape().DebugString()));
     69     OP_REQUIRES(context, filter.dims() == 4,
     70                 errors::InvalidArgument("filter must be 4-dimensional: ",
     71                                         filter.shape().DebugString()));
     72 
     73     const int32 in_depth = input.dim_size(3);
     74     OP_REQUIRES(context, in_depth == filter.dim_size(2),
     75                 errors::InvalidArgument(
     76                     "input and filter must have the same depth: ", in_depth,
     77                     " vs ", filter.dim_size(2)));
     78     const int32 batch = input.dim_size(0);
     79     const int32 input_rows = input.dim_size(1);
     80     const int32 input_cols = input.dim_size(2);
     81 
     82     const int32 filter_rows = filter.dim_size(0);
     83     const int32 filter_cols = filter.dim_size(1);
     84     const int32 depth_multiplier = filter.dim_size(3);
     85 
     86     const int32 out_depth = in_depth * depth_multiplier;
     87 
     88     const int32 stride = strides_[1];
     89 
     90     int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
     91     OP_REQUIRES_OK(context,
     92                    GetWindowedOutputSize(input_rows, filter_rows, stride,
     93                                          padding_, &out_rows, &pad_rows));
     94     OP_REQUIRES_OK(context,
     95                    GetWindowedOutputSize(input_cols, filter_cols, stride,
     96                                          padding_, &out_cols, &pad_cols));
     97     TensorShape out_shape({batch, out_rows, out_cols, out_depth});
     98     OP_REQUIRES(
     99         context,
    100         FastBoundsCheck(out_shape.num_elements(),
    101                         std::numeric_limits<int32>::max()),
    102         errors::InvalidArgument("Output elements too large for NEON kernel"));
    103 
    104     // Output tensor is of the following dimensions:
    105     // [ in_batch, out_rows, out_cols, out_depth ]
    106     Tensor* output = nullptr;
    107     OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
    108 
    109     VLOG(2) << "NeonDepthwiseConv2dNative: "
    110             << " Input: [" << batch << ", " << input_rows << ", " << input_cols
    111             << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
    112             << filter_cols << ", " << in_depth << ", " << depth_multiplier
    113             << "]; stride = " << stride << ", pad_rows = " << pad_rows
    114             << ", pad_cols = " << pad_cols << ", output: [" << batch << ", "
    115             << out_rows << ", " << out_cols << ", " << out_depth << "]";
    116 
    117     // If there is nothing to compute, return.
    118     if (out_shape.num_elements() == 0) {
    119       return;
    120     }
    121 
    122     const float* input_ptr = input.template flat<float>().data();
    123     const float* filter_ptr = filter.template flat<float>().data();
    124     float* output_ptr = output->template flat<float>().data();
    125 
    126     auto input_neon_dims = ToNeonDims(input.shape());
    127     auto filter_neon_dims = FilterToNeonDims(filter.shape());
    128     auto bias_neon_dims = BiasNeonDims(filter.shape());
    129 
    130     int64 bias_size = bias_neon_dims.sizes[0];
    131     float* bias_ptr = static_cast<float*>(port::AlignedMalloc(
    132         bias_size * sizeof(float), Allocator::kAllocatorAlignment));
    133     memset(bias_ptr, 0, bias_size * sizeof(float));
    134 
    135     neon::DepthwiseConv<neon::FusedActivationFunctionType::kNone>(
    136         input_ptr, input_neon_dims, filter_ptr, filter_neon_dims, bias_ptr,
    137         bias_neon_dims, stride, pad_cols, pad_rows, depth_multiplier,
    138         output_ptr, ToNeonDims(out_shape));
    139 
    140     port::AlignedFree(bias_ptr);
    141   }
    142 
    143  private:
    144   void SetNeonDimStrides(neon::Dims<4>* d) {
    145     int64 stride = 1;
    146     for (int i = 0; i < 4; ++i) {
    147       d->strides[i] = stride;
    148       stride *= d->sizes[i];
    149     }
    150   }
    151 
    152   neon::Dims<4> ToNeonDims(const TensorShape& input) {
    153     // Dims in the neon kernels are channel, x, y, batch order.
    154     neon::Dims<4> result;
    155     result.sizes[0] = input.dim_size(3);
    156     result.sizes[1] = input.dim_size(2);
    157     result.sizes[2] = input.dim_size(1);
    158     result.sizes[3] = input.dim_size(0);
    159     SetNeonDimStrides(&result);
    160     return result;
    161   }
    162 
    163   neon::Dims<4> FilterToNeonDims(const TensorShape& filter) {
    164     // Dims in the neon kernels are channel, x, y, batch order.
    165     neon::Dims<4> result;
    166     result.sizes[0] = filter.dim_size(2) * filter.dim_size(3);
    167     result.sizes[1] = filter.dim_size(1);
    168     result.sizes[2] = filter.dim_size(0);
    169     result.sizes[3] = 1;
    170     SetNeonDimStrides(&result);
    171 
    172     return result;
    173   }
    174 
    175   neon::Dims<4> BiasNeonDims(const TensorShape& filter) {
    176     // Dims in the neon kernels are channel, x, y, batch order.
    177     // Bias has only output channel set.
    178     neon::Dims<4> result;
    179     result.sizes[0] =
    180         filter.dim_size(2) * filter.dim_size(3);  // output channels
    181     result.sizes[1] = 1;
    182     result.sizes[2] = 1;
    183     result.sizes[3] = 1;
    184     SetNeonDimStrides(&result);
    185 
    186     return result;
    187   }
    188 
    189   std::vector<int32> strides_;
    190   Padding padding_;
    191 
    192   TF_DISALLOW_COPY_AND_ASSIGN(NeonDepthwiseConv2dNativeOp);
    193 };
    194 
    195 #define REGISTER_CPU_KERNEL(T)                            \
    196   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")   \
    197                               .Device(DEVICE_CPU)         \
    198                               .TypeConstraint<float>("T") \
    199                               .Label("neon"),             \
    200                           NeonDepthwiseConv2dNativeOp);
    201 
    202 TF_CALL_float(REGISTER_CPU_KERNEL);
    203 
    204 }  // namespace tensorflow
    205