1 // Copyright 2015 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // unpack.h: unpacking the result blocks computed by compute.h, 16 // storing them into the destination matrix. 17 18 #ifndef GEMMLOWP_INTERNAL_UNPACK_H_ 19 #define GEMMLOWP_INTERNAL_UNPACK_H_ 20 21 #include "allocator.h" 22 #include "block_params.h" 23 #include "output.h" 24 #include "pack.h" 25 26 #include <cmath> 27 28 namespace gemmlowp { 29 30 class PackedResult { 31 public: 32 PackedResult(Allocator* _allocator, const BlockParams& _block_params) 33 : allocator_(_allocator), block_params_(_block_params) { 34 matrix_handle_ = allocator_->Reserve<std::int32_t>(block_params_.l2_rows * 35 block_params_.l2_cols); 36 } 37 38 ~PackedResult() {} 39 40 MatrixMap<std::int32_t, MapOrder::ColMajor> Map() { 41 return MatrixMap<std::int32_t, MapOrder::ColMajor>( 42 allocator_->GetPointer<std::int32_t>(matrix_handle_), 43 block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); 44 } 45 46 MatrixMap<const std::int32_t, MapOrder::ColMajor> Map() const { 47 return MatrixMap<const std::int32_t, MapOrder::ColMajor>( 48 allocator_->GetPointer<const std::int32_t>(matrix_handle_), 49 block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); 50 } 51 52 private: 53 Allocator* allocator_; 54 Allocator::Handle matrix_handle_; 55 const BlockParams& block_params_; 56 }; 57 58 template <std::uint32_t numerator, std::uint32_t denominator> 59 std::int32_t RoundingMultiplyByConstantFraction(std::int32_t x) { 60 if (numerator == denominator) { 61 return x; 62 } 63 64 // We'll use only signed arithmetic here. This is 65 // simpler (since this function operates on signed int32's) and 66 // more friendly to ARM NEON, where this allows us to use the 67 // VQRDMULH instruction. 68 static const std::int32_t int_quotient = 69 (numerator + denominator / 2) / denominator; 70 static const std::int32_t remaining_numerator = 71 numerator - int_quotient * denominator; 72 static const std::int32_t scaled_remaining_numerator = 73 static_cast<std::int32_t>( 74 (static_cast<std::int64_t>(remaining_numerator) * (1ll << 31)) / 75 denominator); 76 77 const std::int64_t scaled_remaining_product = 78 static_cast<std::int64_t>(x) * 79 static_cast<std::int64_t>(scaled_remaining_numerator); 80 81 const std::int32_t scaled_remaining_product_nudge = 82 (scaled_remaining_product > 0 ? 1 : -1) * (1 << 30); 83 84 const std::int32_t remaining_product = static_cast<std::int32_t>( 85 (scaled_remaining_product + scaled_remaining_product_nudge) / (1u << 31)); 86 87 return x * int_quotient + remaining_product; 88 } 89 90 template <typename BitDepthParams, typename ResultBlockType, 91 typename PackedResultType, typename LhsOffset, typename RhsOffset, 92 typename OutputPipelineType> 93 struct UnpackResultImplGeneric { 94 static void Unpack(ResultBlockType* dst, const PackedResultType& src, 95 int depth, const std::int32_t* lhs_sums_of_each_slice, 96 const std::int32_t* rhs_sums_of_each_slice, 97 const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, 98 const OutputPipelineType& output_pipeline) { 99 auto src_map = src.Map(); 100 // No top-level blocking in the depth dimension at the moment. 101 // Too much loss of precision. 102 const int kLhsBits = BitDepthParams::LhsBitDepth::kBits; 103 const int kRhsBits = BitDepthParams::RhsBitDepth::kBits; 104 const std::int32_t kLhsMax = (1 << kLhsBits) - 1; 105 const std::int32_t kRhsMax = (1 << kRhsBits) - 1; 106 OutputPipelineExecutor<OutputPipelineType, FragmentInt32x1x1> 107 output_pipeline_executor(output_pipeline); 108 for (int c = 0; c < dst->cols(); c++) { 109 for (int r = 0; r < dst->rows(); r++) { 110 // To understand this code, read 111 // doc/low-precision.txt 112 // doc/less-than-8-bit.txt 113 // We have 4 terms to sum: xx, x1, 1x, 11. 114 // In case of requantization, we first need to scale them back 115 // to the original scale, using RoundingMultiplyByConstantFraction. 116 std::int32_t raw_xx = src_map(r, c); 117 std::int32_t raw_x1 = lhs_sums_of_each_slice[r] * rhs_offset(c); 118 std::int32_t raw_1x = rhs_sums_of_each_slice[c] * lhs_offset(r); 119 std::int32_t term_xx = 120 RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>( 121 raw_xx); 122 std::int32_t term_x1 = 123 RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1); 124 std::int32_t term_1x = 125 RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x); 126 std::int32_t term_11 = lhs_offset(r) * rhs_offset(c) * depth; 127 // Sum the 4 terms. 128 FragmentInt32x1x1 sum = term_xx + term_x1 + term_1x + term_11; 129 130 output_pipeline_executor.Execute(sum, dst, r, c); 131 } 132 } 133 } 134 }; 135 136 template <typename BitDepthParams, typename ResultBlockType, 137 typename PackedResultType, typename LhsOffset, typename RhsOffset, 138 typename OutputPipelineType> 139 struct UnpackResultImpl 140 : UnpackResultImplGeneric<BitDepthParams, ResultBlockType, PackedResultType, 141 LhsOffset, RhsOffset, OutputPipelineType> {}; 142 143 template <typename BitDepthParams, typename ResultBlockType, 144 typename PackedResultType, typename LhsOffset, typename RhsOffset, 145 typename OutputPipelineType> 146 void UnpackResult(ResultBlockType* dst, const PackedResultType& src, int depth, 147 const std::int32_t* lhs_sums_of_each_slice, 148 const std::int32_t* rhs_sums_of_each_slice, 149 const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, 150 const OutputPipelineType& output_pipeline) { 151 ScopedProfilingLabel label("unpack"); 152 UnpackResultImpl<BitDepthParams, ResultBlockType, PackedResultType, 153 LhsOffset, RhsOffset, OutputPipelineType>::Unpack( 154 dst, src, depth, lhs_sums_of_each_slice, rhs_sums_of_each_slice, 155 lhs_offset, rhs_offset, output_pipeline); 156 } 157 158 } // namespace gemmlowp 159 160 #ifdef GEMMLOWP_NEON 161 #include "unpack_neon.h" 162 #endif 163 164 #endif // GEMMLOWP_INTERNAL_UNPACK_H_ 165