Home | History | Annotate | Download | only in internal
      1 // Copyright 2015 Google Inc. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 // unpack.h: unpacking the result blocks computed by compute.h,
     16 // storing them into the destination matrix.
     17 
     18 #ifndef GEMMLOWP_INTERNAL_UNPACK_H_
     19 #define GEMMLOWP_INTERNAL_UNPACK_H_
     20 
     21 #include "allocator.h"
     22 #include "block_params.h"
     23 #include "output.h"
     24 #include "pack.h"
     25 
     26 #include <cmath>
     27 
     28 namespace gemmlowp {
     29 
     30 class PackedResult {
     31  public:
     32   PackedResult(Allocator* _allocator, const BlockParams& _block_params)
     33       : allocator_(_allocator), block_params_(_block_params) {
     34     matrix_handle_ = allocator_->Reserve<std::int32_t>(block_params_.l2_rows *
     35                                                        block_params_.l2_cols);
     36   }
     37 
     38   ~PackedResult() {}
     39 
     40   MatrixMap<std::int32_t, MapOrder::ColMajor> Map() {
     41     return MatrixMap<std::int32_t, MapOrder::ColMajor>(
     42         allocator_->GetPointer<std::int32_t>(matrix_handle_),
     43         block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows);
     44   }
     45 
     46   MatrixMap<const std::int32_t, MapOrder::ColMajor> Map() const {
     47     return MatrixMap<const std::int32_t, MapOrder::ColMajor>(
     48         allocator_->GetPointer<const std::int32_t>(matrix_handle_),
     49         block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows);
     50   }
     51 
     52  private:
     53   Allocator* allocator_;
     54   Allocator::Handle matrix_handle_;
     55   const BlockParams& block_params_;
     56 };
     57 
     58 template <std::uint32_t numerator, std::uint32_t denominator>
     59 std::int32_t RoundingMultiplyByConstantFraction(std::int32_t x) {
     60   if (numerator == denominator) {
     61     return x;
     62   }
     63 
     64   // We'll use only signed arithmetic here. This is
     65   // simpler (since this function operates on signed int32's) and
     66   // more friendly to ARM NEON, where this allows us to use the
     67   // VQRDMULH instruction.
     68   static const std::int32_t int_quotient =
     69       (numerator + denominator / 2) / denominator;
     70   static const std::int32_t remaining_numerator =
     71       numerator - int_quotient * denominator;
     72   static const std::int32_t scaled_remaining_numerator =
     73       static_cast<std::int32_t>(
     74           (static_cast<std::int64_t>(remaining_numerator) * (1ll << 31)) /
     75           denominator);
     76 
     77   const std::int64_t scaled_remaining_product =
     78       static_cast<std::int64_t>(x) *
     79       static_cast<std::int64_t>(scaled_remaining_numerator);
     80 
     81   const std::int32_t scaled_remaining_product_nudge =
     82       (scaled_remaining_product > 0 ? 1 : -1) * (1 << 30);
     83 
     84   const std::int32_t remaining_product = static_cast<std::int32_t>(
     85       (scaled_remaining_product + scaled_remaining_product_nudge) / (1u << 31));
     86 
     87   return x * int_quotient + remaining_product;
     88 }
     89 
     90 template <typename BitDepthParams, typename ResultBlockType,
     91           typename PackedResultType, typename LhsOffset, typename RhsOffset,
     92           typename OutputPipelineType>
     93 struct UnpackResultImplGeneric {
     94   static void Unpack(ResultBlockType* dst, const PackedResultType& src,
     95                      int depth, const std::int32_t* lhs_sums_of_each_slice,
     96                      const std::int32_t* rhs_sums_of_each_slice,
     97                      const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
     98                      const OutputPipelineType& output_pipeline) {
     99     auto src_map = src.Map();
    100     // No top-level blocking in the depth dimension at the moment.
    101     // Too much loss of precision.
    102     const int kLhsBits = BitDepthParams::LhsBitDepth::kBits;
    103     const int kRhsBits = BitDepthParams::RhsBitDepth::kBits;
    104     const std::int32_t kLhsMax = (1 << kLhsBits) - 1;
    105     const std::int32_t kRhsMax = (1 << kRhsBits) - 1;
    106     OutputPipelineExecutor<OutputPipelineType, FragmentInt32x1x1>
    107         output_pipeline_executor(output_pipeline);
    108     for (int c = 0; c < dst->cols(); c++) {
    109       for (int r = 0; r < dst->rows(); r++) {
    110         // To understand this code, read
    111         //   doc/low-precision.txt
    112         //   doc/less-than-8-bit.txt
    113         // We have 4 terms to sum: xx, x1, 1x, 11.
    114         // In case of requantization, we first need to scale them back
    115         // to the original scale, using RoundingMultiplyByConstantFraction.
    116         std::int32_t raw_xx = src_map(r, c);
    117         std::int32_t raw_x1 = lhs_sums_of_each_slice[r] * rhs_offset(c);
    118         std::int32_t raw_1x = rhs_sums_of_each_slice[c] * lhs_offset(r);
    119         std::int32_t term_xx =
    120             RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
    121                 raw_xx);
    122         std::int32_t term_x1 =
    123             RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1);
    124         std::int32_t term_1x =
    125             RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x);
    126         std::int32_t term_11 = lhs_offset(r) * rhs_offset(c) * depth;
    127         // Sum the 4 terms.
    128         FragmentInt32x1x1 sum = term_xx + term_x1 + term_1x + term_11;
    129 
    130         output_pipeline_executor.Execute(sum, dst, r, c);
    131       }
    132     }
    133   }
    134 };
    135 
    136 template <typename BitDepthParams, typename ResultBlockType,
    137           typename PackedResultType, typename LhsOffset, typename RhsOffset,
    138           typename OutputPipelineType>
    139 struct UnpackResultImpl
    140     : UnpackResultImplGeneric<BitDepthParams, ResultBlockType, PackedResultType,
    141                               LhsOffset, RhsOffset, OutputPipelineType> {};
    142 
    143 template <typename BitDepthParams, typename ResultBlockType,
    144           typename PackedResultType, typename LhsOffset, typename RhsOffset,
    145           typename OutputPipelineType>
    146 void UnpackResult(ResultBlockType* dst, const PackedResultType& src, int depth,
    147                   const std::int32_t* lhs_sums_of_each_slice,
    148                   const std::int32_t* rhs_sums_of_each_slice,
    149                   const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
    150                   const OutputPipelineType& output_pipeline) {
    151   ScopedProfilingLabel label("unpack");
    152   UnpackResultImpl<BitDepthParams, ResultBlockType, PackedResultType,
    153                    LhsOffset, RhsOffset, OutputPipelineType>::Unpack(
    154       dst, src, depth, lhs_sums_of_each_slice, rhs_sums_of_each_slice,
    155       lhs_offset, rhs_offset, output_pipeline);
    156 }
    157 
    158 }  // namespace gemmlowp
    159 
    160 #ifdef GEMMLOWP_NEON
    161 #include "unpack_neon.h"
    162 #endif
    163 
    164 #endif  // GEMMLOWP_INTERNAL_UNPACK_H_
    165