1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV 17 #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV 18 19 #include <assert.h> 20 #include <stdint.h> 21 #include <sys/types.h> 22 #include <algorithm> 23 #include <cmath> 24 #include <limits> 25 #include <memory> 26 #include <tuple> 27 #include <type_traits> 28 29 #include "tensorflow/contrib/lite/builtin_op_data.h" 30 #include "tensorflow/contrib/lite/kernels/internal/common.h" 31 #include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h" 32 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" 33 #include "tensorflow/contrib/lite/kernels/internal/types.h" 34 35 namespace tflite { 36 namespace multithreaded_ops { 37 38 class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface { 39 public: 40 explicit EigenThreadPoolWrapper(Eigen::ThreadPool* pool) : pool_(pool) {} 41 ~EigenThreadPoolWrapper() override {} 42 43 void Schedule(std::function<void()> fn) override { 44 pool_->Schedule(std::move(fn)); 45 } 46 int NumThreads() const override { return pool_->NumThreads(); } 47 int CurrentThreadId() const override { return pool_->CurrentThreadId(); } 48 49 private: 50 Eigen::ThreadPool* pool_ = nullptr; 51 }; 52 53 // We have a single global threadpool for all convolution operations. This means 54 // that inferences started from different threads may block each other, but 55 // since the underlying resource of CPU cores should be consumed by the 56 // operations anyway, it shouldn't affect overall performance. 57 const Eigen::ThreadPoolDevice& GetThreadPoolDevice() { 58 const int thread_count = 4; 59 static Eigen::ThreadPool* tp = new Eigen::ThreadPool(thread_count); 60 static EigenThreadPoolWrapper* thread_pool_wrapper = 61 new EigenThreadPoolWrapper(tp); 62 static Eigen::ThreadPoolDevice* device = 63 new Eigen::ThreadPoolDevice(thread_pool_wrapper, thread_count); 64 return *device; 65 } 66 67 // Shorthands for the types we need when interfacing with the EigenTensor 68 // library. 69 typedef Eigen::TensorMap< 70 Eigen::Tensor<float, 2, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> 71 EigenMatrix; 72 typedef Eigen::TensorMap< 73 Eigen::Tensor<const float, 2, Eigen::RowMajor, Eigen::DenseIndex>, 74 Eigen::Aligned> 75 ConstEigenMatrix; 76 77 typedef Eigen::TensorMap< 78 Eigen::Tensor<float, 4, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> 79 EigenTensor; 80 typedef Eigen::TensorMap< 81 Eigen::Tensor<const float, 4, Eigen::RowMajor, Eigen::DenseIndex>, 82 Eigen::Aligned> 83 ConstEigenTensor; 84 85 // Utility functions we need for the EigenTensor API. 86 template <typename Device, typename T> 87 struct MatMulConvFunctor { 88 // Computes on device "d": out = in0 * in1, where * is matrix 89 // multiplication. 90 void operator()( 91 const Device& d, EigenMatrix out, ConstEigenMatrix in0, 92 ConstEigenMatrix in1, 93 const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) { 94 out.device(d) = in0.contract(in1, dim_pair); 95 } 96 }; 97 98 template <class T> 99 class EigenTensorConvFunctor { 100 private: 101 Eigen::PaddingType TfLitePadding2EigenPadding(TfLitePadding padding) { 102 switch (padding) { 103 case kTfLitePaddingValid: 104 return Eigen::PADDING_VALID; 105 case kTfLitePaddingSame: 106 return Eigen::PADDING_SAME; 107 case kTfLitePaddingUnknown: 108 assert(false); // should never get here. 109 return Eigen::PADDING_VALID; 110 } 111 return Eigen::PADDING_SAME; // Prevent compiler warning about missing 112 // return 113 } 114 115 public: 116 void operator()(const T* input_data, T* im2col_buffer, int input_batches, 117 int input_height, int input_width, int input_depth, 118 const T* filter_data, int filter_height, int filter_width, 119 int filter_count, int stride_rows, int stride_cols, 120 int pad_width, int pad_height, TfLitePadding padding, 121 T* output_data, int output_height, int output_width) { 122 const Eigen::ThreadPoolDevice& device = GetThreadPoolDevice(); 123 124 const bool is_1x1_kernel = (filter_height == 1 && filter_width == 1 && 125 stride_rows == 1 && stride_cols == 1); 126 if (is_1x1_kernel) { 127 // For 1x1 kernel, the 2D convolution is reduced to matrix 128 // multiplication. 129 const int conv_width = output_height * output_width; 130 Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair; 131 dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0); 132 EigenMatrix output(output_data, conv_width, filter_count); 133 ConstEigenMatrix input(input_data, conv_width, input_depth); 134 ConstEigenMatrix filter(filter_data, input_depth, filter_count); 135 MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input, 136 filter, dim_pair); 137 } else if (filter_height == input_height && filter_width == input_width && 138 pad_width == 0 && pad_height == 0) { 139 // If the input data and filter have the same height/width, 140 // the 2D convolution is reduced to matrix multiplication. 141 const int k = // Length of reduction dimension. 142 filter_width * filter_height * input_depth; 143 Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair; 144 dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0); 145 EigenMatrix output(output_data, 1, filter_count); 146 ConstEigenMatrix input(input_data, 1, k); 147 ConstEigenMatrix filter(filter_data, k, filter_count); 148 MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input, 149 filter, dim_pair); 150 } else { 151 EigenTensor output(output_data, input_batches, output_height, 152 output_width, filter_count); 153 ConstEigenTensor input(input_data, input_batches, input_height, 154 input_width, input_depth); 155 ConstEigenTensor filter(filter_data, filter_height, filter_width, 156 input_depth, filter_count); 157 output.device(device) = 158 Eigen::SpatialConvolution(input, filter, stride_cols, stride_rows, 159 TfLitePadding2EigenPadding(padding)); 160 } 161 } 162 }; 163 164 inline void Conv(const float* input_data, const Dims<4>& input_dims, 165 const float* filter_data, const Dims<4>& filter_dims, 166 const float* bias_data, const Dims<4>& bias_dims, 167 int stride_width, int stride_height, int pad_width, 168 int pad_height, TfLitePadding padding, 169 float output_activation_min, float output_activation_max, 170 float* output_data, const Dims<4>& output_dims, 171 float* im2col_data, const Dims<4>& im2col_dims) { 172 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 173 const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0); 174 const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0); 175 const int input_height = ArraySize(input_dims, 2); 176 const int input_width = ArraySize(input_dims, 1); 177 const int filter_height = ArraySize(filter_dims, 2); 178 const int filter_width = ArraySize(filter_dims, 1); 179 const int output_height = ArraySize(output_dims, 2); 180 const int output_width = ArraySize(output_dims, 1); 181 EigenTensorConvFunctor<float> conv_functor; 182 conv_functor(input_data, im2col_data, batches, input_height, input_width, 183 input_depth, filter_data, filter_height, filter_width, 184 output_depth, stride_height, stride_width, pad_height, pad_width, 185 padding, output_data, output_height, output_width); 186 187 optimized_ops::AddBiasAndEvalActivationFunction( 188 bias_data, bias_dims, output_data, output_dims, output_activation_min, 189 output_activation_max); 190 } 191 192 } // namespace multithreaded_ops 193 } // namespace tflite 194 195 #endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV 196