// tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
      1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
     17 #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
     18 
     19 #include <assert.h>
     20 #include <stdint.h>
     21 #include <sys/types.h>
     22 #include <algorithm>
     23 #include <cmath>
     24 #include <limits>
     25 #include <memory>
     26 #include <tuple>
     27 #include <type_traits>
     28 
     29 #include "tensorflow/contrib/lite/builtin_op_data.h"
     30 #include "tensorflow/contrib/lite/kernels/internal/common.h"
     31 #include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h"
     32 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
     33 #include "tensorflow/contrib/lite/kernels/internal/types.h"
     34 
     35 namespace tflite {
     36 namespace multithreaded_ops {
     37 
     38 class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface {
     39  public:
     40   explicit EigenThreadPoolWrapper(Eigen::ThreadPool* pool) : pool_(pool) {}
     41   ~EigenThreadPoolWrapper() override {}
     42 
     43   void Schedule(std::function<void()> fn) override {
     44     pool_->Schedule(std::move(fn));
     45   }
     46   int NumThreads() const override { return pool_->NumThreads(); }
     47   int CurrentThreadId() const override { return pool_->CurrentThreadId(); }
     48 
     49  private:
     50   Eigen::ThreadPool* pool_ = nullptr;
     51 };
     52 
     53 // We have a single global threadpool for all convolution operations. This means
     54 // that inferences started from different threads may block each other, but
     55 // since the underlying resource of CPU cores should be consumed by the
     56 // operations anyway, it shouldn't affect overall performance.
     57 const Eigen::ThreadPoolDevice& GetThreadPoolDevice() {
     58   const int thread_count = 4;
     59   static Eigen::ThreadPool* tp = new Eigen::ThreadPool(thread_count);
     60   static EigenThreadPoolWrapper* thread_pool_wrapper =
     61       new EigenThreadPoolWrapper(tp);
     62   static Eigen::ThreadPoolDevice* device =
     63       new Eigen::ThreadPoolDevice(thread_pool_wrapper, thread_count);
     64   return *device;
     65 }
     66 
     67 // Shorthands for the types we need when interfacing with the EigenTensor
     68 // library.
     69 typedef Eigen::TensorMap<
     70     Eigen::Tensor<float, 2, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned>
     71     EigenMatrix;
     72 typedef Eigen::TensorMap<
     73     Eigen::Tensor<const float, 2, Eigen::RowMajor, Eigen::DenseIndex>,
     74     Eigen::Aligned>
     75     ConstEigenMatrix;
     76 
     77 typedef Eigen::TensorMap<
     78     Eigen::Tensor<float, 4, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned>
     79     EigenTensor;
     80 typedef Eigen::TensorMap<
     81     Eigen::Tensor<const float, 4, Eigen::RowMajor, Eigen::DenseIndex>,
     82     Eigen::Aligned>
     83     ConstEigenTensor;
     84 
// Utility functions we need for the EigenTensor API.
template <typename Device, typename T>
struct MatMulConvFunctor {
  // Computes on device "d": out = in0 * in1, where * is matrix
  // multiplication. The contraction axes are given by dim_pair; callers in
  // this file pass {(1, 0)}, i.e. a standard row-by-column matrix product.
  // The assignment via .device(d) makes Eigen evaluate the contraction on
  // the supplied (thread pool) device.
  void operator()(
      const Device& d, EigenMatrix out, ConstEigenMatrix in0,
      ConstEigenMatrix in1,
      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
    out.device(d) = in0.contract(in1, dim_pair);
  }
};
     97 
     98 template <class T>
     99 class EigenTensorConvFunctor {
    100  private:
    101   Eigen::PaddingType TfLitePadding2EigenPadding(TfLitePadding padding) {
    102     switch (padding) {
    103       case kTfLitePaddingValid:
    104         return Eigen::PADDING_VALID;
    105       case kTfLitePaddingSame:
    106         return Eigen::PADDING_SAME;
    107       case kTfLitePaddingUnknown:
    108         assert(false);  // should never get here.
    109         return Eigen::PADDING_VALID;
    110     }
    111     return Eigen::PADDING_SAME;  // Prevent compiler warning about missing
    112                                  // return
    113   }
    114 
    115  public:
    116   void operator()(const T* input_data, T* im2col_buffer, int input_batches,
    117                   int input_height, int input_width, int input_depth,
    118                   const T* filter_data, int filter_height, int filter_width,
    119                   int filter_count, int stride_rows, int stride_cols,
    120                   int pad_width, int pad_height, TfLitePadding padding,
    121                   T* output_data, int output_height, int output_width) {
    122     const Eigen::ThreadPoolDevice& device = GetThreadPoolDevice();
    123 
    124     const bool is_1x1_kernel = (filter_height == 1 && filter_width == 1 &&
    125                                 stride_rows == 1 && stride_cols == 1);
    126     if (is_1x1_kernel) {
    127       // For 1x1 kernel, the 2D convolution is reduced to matrix
    128       // multiplication.
    129       const int conv_width = output_height * output_width;
    130       Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
    131       dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
    132       EigenMatrix output(output_data, conv_width, filter_count);
    133       ConstEigenMatrix input(input_data, conv_width, input_depth);
    134       ConstEigenMatrix filter(filter_data, input_depth, filter_count);
    135       MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input,
    136                                                       filter, dim_pair);
    137     } else if (filter_height == input_height && filter_width == input_width &&
    138                pad_width == 0 && pad_height == 0) {
    139       // If the input data and filter have the same height/width,
    140       // the 2D convolution is reduced to matrix multiplication.
    141       const int k =  // Length of reduction dimension.
    142           filter_width * filter_height * input_depth;
    143       Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
    144       dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
    145       EigenMatrix output(output_data, 1, filter_count);
    146       ConstEigenMatrix input(input_data, 1, k);
    147       ConstEigenMatrix filter(filter_data, k, filter_count);
    148       MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input,
    149                                                       filter, dim_pair);
    150     } else {
    151       EigenTensor output(output_data, input_batches, output_height,
    152                          output_width, filter_count);
    153       ConstEigenTensor input(input_data, input_batches, input_height,
    154                              input_width, input_depth);
    155       ConstEigenTensor filter(filter_data, filter_height, filter_width,
    156                               input_depth, filter_count);
    157       output.device(device) =
    158           Eigen::SpatialConvolution(input, filter, stride_cols, stride_rows,
    159                                     TfLitePadding2EigenPadding(padding));
    160     }
    161   }
    162 };
    163 
// Multithreaded float Conv2D entry point: unpacks the tflite Dims<4> shapes,
// runs the convolution on the shared Eigen thread pool, then adds the bias
// and applies the activation clamp [output_activation_min,
// output_activation_max].
//
// Dims<4> axis convention (as read below): index 0 is the innermost
// (depth/channel) dimension, index 1 is width, index 2 is height, and
// index 3 is the outermost (batch) dimension.
//
// im2col_data/im2col_dims are accepted for API compatibility with other conv
// backends; the Eigen path forwards im2col_data but never reads or writes it.
inline void Conv(const float* input_data, const Dims<4>& input_dims,
                 const float* filter_data, const Dims<4>& filter_dims,
                 const float* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, TfLitePadding padding,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims,
                 float* im2col_data, const Dims<4>& im2col_dims) {
  // MatchingArraySize also asserts the two dims agree on the shared axis.
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
  const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  // stride_height/stride_width map onto the functor's rows/cols strides.
  // The pad values are only used for a zero-padding check inside the
  // functor, so their relative order there is not behavior-critical.
  EigenTensorConvFunctor<float> conv_functor;
  conv_functor(input_data, im2col_data, batches, input_height, input_width,
               input_depth, filter_data, filter_height, filter_width,
               output_depth, stride_height, stride_width, pad_height, pad_width,
               padding, output_data, output_height, output_width);

  optimized_ops::AddBiasAndEvalActivationFunction(
      bias_data, bias_dims, output_data, output_dims, output_activation_min,
      output_activation_max);
}
    191 
    192 }  // namespace multithreaded_ops
    193 }  // namespace tflite
    194 
    195 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
    196