// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// unpack_neon.h: optimized NEON specializations of the templates in unpack.h.

#ifndef GEMMLOWP_INTERNAL_UNPACK_NEON_H_
#define GEMMLOWP_INTERNAL_UNPACK_NEON_H_

#include "output_neon.h"
#include "unpack.h"

#include <arm_neon.h>

namespace gemmlowp {

template <std::uint32_t numerator, std::uint32_t denominator>
int32x4_t RoundingMultiplyByConstantFraction(int32x4_t x) {
  static_assert(numerator > 0 && denominator > 0,
                "only supporting positive num/denom");

  if (numerator == denominator) {
    return x;
  }

  static const std::int32_t int_quotient =
      (numerator + denominator / 2) / denominator;
  static const std::int32_t remaining_numerator =
      numerator - int_quotient * denominator;
  static const std::int32_t scaled_remaining_numerator =
      static_cast<std::int32_t>(
          (static_cast<std::int64_t>(remaining_numerator) * (1ll << 31)) /
          denominator);
  // Note: vqrdmulh instruction is rounding doubling multiply high.
  const int32x4_t remaining_product =
      vqrdmulhq_n_s32(x, scaled_remaining_numerator);

  return vmlaq_n_s32(remaining_product, x, int_quotient);
}
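
// Illustration, not part of the original source: a scalar model of the
// per-lane computation performed by RoundingMultiplyByConstantFraction
// above. vqrdmulhq_n_s32 is a saturating rounding doubling multiply
// returning the high half: per lane, (2 * a * b + (1 << 31)) >> 32.
// For example, with numerator 255 and denominator kLhsMax = 127 (7-bit
// depth), the factor 255/127 decomposes as int_quotient = 2 plus a
// remainder of 1/127, encoded in Q31 fixed point as (1 << 31) / 127.
template <std::uint32_t numerator, std::uint32_t denominator>
std::int32_t ScalarRoundingMultiplyByConstantFraction(std::int32_t x) {
  if (numerator == denominator) {
    return x;
  }
  const std::int32_t int_quotient =
      (numerator + denominator / 2) / denominator;
  const std::int32_t remaining_numerator =
      numerator - int_quotient * denominator;
  const std::int32_t scaled_remaining_numerator = static_cast<std::int32_t>(
      (static_cast<std::int64_t>(remaining_numerator) * (1ll << 31)) /
      denominator);
  // Rounding doubling multiply-high; the saturating edge case of vqrdmulh
  // (both operands equal to INT32_MIN) is ignored in this sketch.
  const std::int32_t remaining_product = static_cast<std::int32_t>(
      (2 * static_cast<std::int64_t>(x) * scaled_remaining_numerator +
       (1ll << 31)) >>
      32);
  return remaining_product + x * int_quotient;
}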

template <typename tScalar, VectorShape tShape>
int32x4_t get_int32x4_t_and_inc(
    ConstIterator<VectorMap<tScalar, tShape>>* iterator) {
  const int32x4_t result = vld1q_s32(iterator->get());
  *iterator += 4;
  return result;
}

template <typename tScalar, VectorShape tShape>
int32x4_t get_int32x4_t_and_inc(
    ConstIterator<VectorDup<tScalar, tShape>>* iterator) {
  const int32x4_t result = vdupq_n_s32(**iterator);
  // Increment really does nothing for VectorDup.
  *iterator += 4;
  return result;
}
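
// Note (added for exposition): these two overloads let the unpacking loops
// below consume the lhs_offset vector through a single code path, whether
// it is backed by per-row storage (VectorMap, loaded four lanes at a time
// with vld1q_s32) or by a single constant (VectorDup, broadcast with
// vdupq_n_s32, for which the increment is a no-op).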

template <typename BitDepthParams, typename PackedResultType,
          typename OutputScalar, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
struct UnpackResultImpl<BitDepthParams,
                        MatrixMap<OutputScalar, MapOrder::ColMajor>,
                        PackedResultType, LhsOffset, RhsOffset,
                        OutputPipelineType> {
  typedef MatrixMap<OutputScalar, MapOrder::ColMajor> ResultBlockType;
  static void Unpack(ResultBlockType* dst, const PackedResultType& src,
                     int depth, const std::int32_t* lhs_sums_of_each_slice,
                     const std::int32_t* rhs_sums_of_each_slice,
                     const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                     const OutputPipelineType& output_pipeline) {
    ScopedProfilingLabel label("optimized path (NEON)");
    const int kLhsBits = BitDepthParams::LhsBitDepth::kBits;
    const int kRhsBits = BitDepthParams::RhsBitDepth::kBits;
    const std::int32_t kLhsMax = (1 << kLhsBits) - 1;
    const std::int32_t kRhsMax = (1 << kRhsBits) - 1;
    auto src_map = src.Map();
    OutputPipelineExecutor<OutputPipelineType, FragmentInt32x1x1>
        output_pipeline_executor_int32x1x1(output_pipeline);
    OutputPipelineExecutor<OutputPipelineType, NEONFragmentInt32x4x1>
        output_pipeline_executor_int32x4x1(output_pipeline);
    OutputPipelineExecutor<OutputPipelineType, NEONFragmentInt32x16x1>
        output_pipeline_executor_int32x16x1(output_pipeline);

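    // For exposition (this restates the generic code in unpack.h): each
    // destination entry is assembled from four terms,
    //   q = term_xx + term_x1 + term_1x + term_11,
    // coming from the expansion of
    //   sum_d (lhs(r, d) + lhs_offset(r)) * (rhs(d, c) + rhs_offset(c)):
    //   raw_xx  = sum_d lhs * rhs   (the accumulators stored in src)
    //   raw_x1  = lhs_sums_of_each_slice[r] * rhs_offset(c)
    //   raw_1x  = lhs_offset(r) * rhs_sums_of_each_slice[c]
    //   term_11 = lhs_offset(r) * rhs_offset(c) * depth
    // The raw_* terms are rescaled from the packed bit depths (kLhsMax,
    // kRhsMax) back to the 8-bit range by RoundingMultiplyByConstantFraction.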
    for (int c = 0; c < dst->cols(); c++) {
      const std::int32_t* src_ptr = src_map.data(0, c);
      const std::int32_t* sums_of_each_slice_ptr = lhs_sums_of_each_slice;
      auto lhs_offset_iter = const_iterator(lhs_offset);
      const std::int32_t rhs_offset_c = rhs_offset(c);
      const std::int32_t rhs_sums_of_each_slice_c = rhs_sums_of_each_slice[c];

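      // For exposition: the rows of this column are handled in three tiers
      // below, 16 at a time (an int32x4x4_t, i.e. four q registers, forming
      // a NEONFragmentInt32x16x1), then 4 at a time (NEONFragmentInt32x4x1),
      // then one by one (FragmentInt32x1x1), matching the three output
      // pipeline executors constructed above.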
      // Handle 16 values at once for higher performance.
      int dst_rows_aligned16 = RoundDown<16>(dst->rows());
      for (int r = 0; r < dst_rows_aligned16; r += 16) {
        // Compute the sum of the 4 terms,
        //   q = term_xx + term_x1 + term_1x + term_11.
        // Refer to the generic code in unpack.h.
        int32x4_t raw_xx[4];
        for (int i = 0; i < 4; i++) {
          raw_xx[i] = vld1q_s32(src_ptr);
          src_ptr += 4;
        }
        int32x4_t raw_x1[4];
        for (int i = 0; i < 4; i++) {
          const int32x4_t sum_x1 = vld1q_s32(sums_of_each_slice_ptr);
          raw_x1[i] = vmulq_n_s32(sum_x1, rhs_offset_c);
          sums_of_each_slice_ptr += 4;
        }
        int32x4_t raw_1x[4];
        int32x4_t term_11[4];
        for (int i = 0; i < 4; i++) {
          const int32x4_t lhs_offsets = get_int32x4_t_and_inc(&lhs_offset_iter);
          raw_1x[i] = vmulq_n_s32(lhs_offsets, rhs_sums_of_each_slice_c);
          term_11[i] = vmulq_n_s32(lhs_offsets, rhs_offset_c * depth);
        }
        int32x4_t term_xx[4];
        for (int i = 0; i < 4; i++) {
          term_xx[i] =
              RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
                  raw_xx[i]);
        }
        int32x4_t term_x1[4];
        for (int i = 0; i < 4; i++) {
          term_x1[i] =
              RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1[i]);
        }
        int32x4_t term_1x[4];
        for (int i = 0; i < 4; i++) {
          term_1x[i] =
              RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x[i]);
        }
        int32x4x4_t q;
        for (int i = 0; i < 4; i++) {
          q.val[i] = vaddq_s32(vaddq_s32(term_xx[i], term_x1[i]),
                               vaddq_s32(term_1x[i], term_11[i]));
        }
        NEONFragmentInt32x16x1 f(q);
        output_pipeline_executor_int32x16x1.Execute(f, dst, r, c);
      }
      // We have finished handling groups of 16 entries at once; now
      // try to handle 4 entries at once.
      int dst_rows_aligned4 = RoundDown<4>(dst->rows());
      for (int r = dst_rows_aligned16; r < dst_rows_aligned4; r += 4) {
        // Compute the sum of the 4 terms,
        //   q = term_xx + term_x1 + term_1x + term_11.
        // Refer to the generic code in unpack.h.
        const int32x4_t raw_xx = vld1q_s32(src_ptr);
        src_ptr += 4;
        const int32x4_t term_xx =
            RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
                raw_xx);
        const int32x4_t sum_x1 = vld1q_s32(sums_of_each_slice_ptr);
        const int32x4_t raw_x1 = vmulq_n_s32(sum_x1, rhs_offset_c);
        sums_of_each_slice_ptr += 4;
        const int32x4_t term_x1 =
            RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1);
        const int32x4_t lhs_offsets = get_int32x4_t_and_inc(&lhs_offset_iter);
        const int32x4_t raw_1x =
            vmulq_n_s32(lhs_offsets, rhs_sums_of_each_slice_c);
        const int32x4_t term_1x =
            RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x);
        const int32x4_t term_11 =
            vmulq_n_s32(lhs_offsets, rhs_offset_c * depth);
        int32x4_t q = vaddq_s32(vaddq_s32(term_xx, term_x1),
                                vaddq_s32(term_1x, term_11));
        NEONFragmentInt32x4x1 f(q);
        output_pipeline_executor_int32x4x1.Execute(f, dst, r, c);
      }
      // We have finished handling 4 entries at once; now handle
      // remaining entries one by one. This scalar code is similar
      // to the code in unpack.h, see comments there.
      for (int r = dst_rows_aligned4; r < dst->rows(); r++) {
        const std::int32_t raw_xx = src_map(r, c);
        const std::int32_t raw_x1 = lhs_sums_of_each_slice[r] * rhs_offset_c;
        const std::int32_t raw_1x = rhs_sums_of_each_slice_c * lhs_offset(r);
        const std::int32_t term_xx =
            RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
                raw_xx);
        const std::int32_t term_x1 =
            RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1);
        const std::int32_t term_1x =
            RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x);
        const std::int32_t term_11 = lhs_offset(r) * rhs_offset(c) * depth;
        FragmentInt32x1x1 sum = term_xx + term_x1 + term_1x + term_11;
        output_pipeline_executor_int32x1x1.Execute(sum, dst, r, c);
      }
    }
  }
};

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_UNPACK_NEON_H_