// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output_neon.h: optimized NEON specializations of the templates in output.h.

#ifndef GEMMLOWP_INTERNAL_OUTPUT_NEON_H_
#define GEMMLOWP_INTERNAL_OUTPUT_NEON_H_

#include "output.h"

#include <arm_neon.h>

namespace gemmlowp {

// Definitions of Fragment types wrapping NEON vector types.
typedef Fragment<int32x4_t, 4, 1, MapOrder::ColMajor> NEONFragmentInt32x4x1;
typedef Fragment<int32x4x4_t, 16, 1, MapOrder::ColMajor> NEONFragmentInt32x16x1;
typedef Fragment<uint8x8_t, 4, 1, MapOrder::ColMajor> NEONFragmentUint8x4x1;
typedef Fragment<uint8x16_t, 16, 1, MapOrder::ColMajor> NEONFragmentUint8x16x1;
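//
// Note: as can be seen throughout the specializations below, Fragment (see
// output.h) is implicitly convertible to and from the raw NEON vector type
// it wraps. This is what allows fragments to be passed directly to NEON
// intrinsics (e.g. vaddq_s32(input, ...)) and raw vectors to be returned
// where a fragment type is expected.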

// The code in unpack_neon.h will, whenever possible, process
// 16 entries at once (4 SIMD vectors of 4 entries each at once),
// to offer the compiler better optimization opportunities, reducing
// register dependencies. From the perspective of interfacing with the output
// pipeline, this takes the form of passing Fragment types wrapping int32x4x4_t
// data. In most cases, such data is handled simply by handling each of its
// 4 int32x4_t components separately. This partial specialization handles that
// for arbitrary output stages implementing an int32x4_t path. Only some output
// stages below override this with custom code that handles int32x4x4_t
// data all at once (see OutputStageSaturatingCastToUint8 below).
template <typename OutputStageType>
struct OutputStageEvalImpl<OutputStageType, NEONFragmentInt32x16x1> {
  typedef NEONFragmentInt32x16x1 InputType;
  typedef NEONFragmentInt32x16x1 OutputType;
  typedef OutputStageEvalImpl<OutputStageType, NEONFragmentInt32x4x1>
      ImplInt32x4;
  OutputStageEvalImpl(const OutputStageType& s) : impl_int32x4(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    OutputType output;

    for (int i = 0; i < 4; i++) {
      output.data.val[i] =
          impl_int32x4.Eval(input.data.val[i], row + 4 * i, col);
    }
    return output;
  }

  ImplInt32x4 impl_int32x4;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8Scale for
// NEONFragmentInt32x4x1
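//
// A minimal scalar sketch of the arithmetic vectorized below (the rounding
// term makes the right shift round to nearest; it is 0 when
// result_shift < 1):
//
//   result = ((input + result_offset) * result_mult_int +
//             (1 << (result_shift - 1))) >> result_shift
//
// For example, with result_offset = 127, result_mult_int = 3,
// result_shift = 8 and input = 1000:
// ((1000 + 127) * 3 + 128) >> 8 = 3509 >> 8 = 13.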
template <>
struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale,
                           NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int;
    const std::int32_t result_offset = output_stage.result_offset;
    const std::int32_t preshift_offset =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    const int32x4_t a = vaddq_s32(input, vdupq_n_s32(result_offset));
    const int32x4_t b =
        vmlaq_n_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
    return vshlq_s32(b, vdupq_n_s32(-result_shift));
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC
// (per-row parameters, VectorShape::Col) for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>,
    NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t preshift_offset =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    const int32x4_t result_mult_int =
        vld1q_s32(output_stage.result_mult_int.data(row));
    const int32x4_t result_offset =
        vld1q_s32(output_stage.result_offset.data(row));
    const int32x4_t a = vaddq_s32(input, result_offset);
    const int32x4_t b =
        vmlaq_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
    return vshlq_s32(b, vdupq_n_s32(-result_shift));
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC
// (per-column parameters, VectorShape::Row) for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>,
    NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t preshift_offset =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    // In the Row (per-column) case, both parameter vectors are indexed by
    // the column coordinate.
    const int32x4_t result_mult_int =
        vld1q_s32(output_stage.result_mult_int.data(col));
    const int32x4_t result_offset =
        vld1q_s32(output_stage.result_offset.data(col));
    const int32x4_t a = vaddq_s32(input, result_offset);
    const int32x4_t b =
        vmlaq_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
    return vshlq_s32(b, vdupq_n_s32(-result_shift));
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageSaturatingCastToUint8 for NEONFragmentInt32x4x1
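//
// Only the low 4 lanes of the resulting uint8x8_t are meaningful here; the
// high half is padded with zeros (vdup_n_s16(0)). This padding is the 50%
// register utilization discussed further below.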
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentUint8x4x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  OutputType Eval(InputType input, int, int) const {
    int16x8_t q16 = vcombine_s16(vqmovn_s32(input), vdup_n_s16(0));
    return vqmovun_s16(q16);
  }
};

// In the case of OutputStageSaturatingCastToUint8, the handling of
// NEONFragmentInt32x16x1 data can be made much more efficient by handling
// it all at once, instead of as 4 separate int32x4 values as in the generic
// partial specialization above. This also avoids the poor (50%) register
// utilization of NEONFragmentUint8x4x1: by handling 16 scalar values at
// once, we are able to fill a whole uint8x16_t register.
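//
// The narrowing chain used below, sketched:
//
//   4x int32x4_t --vqmovn_s32---> 4x int16x4_t --vcombine_s16--> 2x int16x8_t
//   2x int16x8_t --vqmovun_s16--> 2x uint8x8_t --vcombine_u8---> 1x uint8x16_t
//
// Each vqmovn/vqmovun step saturates to the narrower type's range, so the
// overall effect is a saturating cast of 16 int32 values to uint8.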
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           NEONFragmentInt32x16x1> {
  typedef NEONFragmentInt32x16x1 InputType;
  typedef NEONFragmentUint8x16x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  OutputType Eval(InputType input, int, int) const {
    int16x8_t q16[2];
    for (int i = 0; i < 2; i++) {
      q16[i] = vcombine_s16(vqmovn_s32(input.data.val[2 * i]),
                            vqmovn_s32(input.data.val[2 * i + 1]));
    }
    return vcombine_u8(vqmovun_s16(q16[0]), vqmovun_s16(q16[1]));
  }
};

// Implementation of OutputStageBiasAddition for NEONFragmentInt32x4x1
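//
// A scalar sketch of the two cases handled below: a row-shaped bias vector
// holds one bias per column, so all 4 entries of this column fragment share
// a single broadcast value; a column-shaped bias vector holds one bias per
// row, so 4 consecutive values are loaded.
//
//   for (int i = 0; i < 4; i++) {
//     output[i] = input[i] + (row_shaped ? bias[col] : bias[row + i]);
//   }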
template <typename VectorType>
struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
                           NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageBiasAddition<VectorType> OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    int32x4_t bias;
    if (VectorType::kShape == VectorShape::Row) {
      bias = vdupq_n_s32(output_stage.bias_vector(col));
    } else {
      bias = vld1q_s32(output_stage.bias_vector.data(row));
    }
    return vaddq_s32(input, bias);
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageClamp for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageClamp, NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageClamp OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const int32x4_t min = vdupq_n_s32(output_stage.min);
    const int32x4_t max = vdupq_n_s32(output_stage.max);
    return vminq_s32(vmaxq_s32(input, min), max);
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageTanh for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageTanh, NEONFragmentInt32x4x1>
    : OutputStageTanhEvalImpl<NEONFragmentInt32x4x1> {
  OutputStageEvalImpl(const OutputStageTanh& output_stage)
      : OutputStageTanhEvalImpl(output_stage) {}
};

// Specialization of StoreFinalOutput for NEONFragmentUint8x4x1.
// This is quite inefficient, but we have no choice: instructions that store
// 32 bits at once also require 32-bit alignment, which we cannot assume
// here. In practice, this slowness is not a problem because the x16 path
// handles most values.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentUint8x4x1 value, DstType* dst, int row,
                             int col) {
  vst1_lane_u8(dst->data(row + 0, col), value, 0);
  vst1_lane_u8(dst->data(row + 1, col), value, 1);
  vst1_lane_u8(dst->data(row + 2, col), value, 2);
  vst1_lane_u8(dst->data(row + 3, col), value, 3);
}

// Specialization of StoreFinalOutput for NEONFragmentUint8x16x1.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentUint8x16x1 value, DstType* dst,
                             int row, int col) {
  vst1q_u8(dst->data(row, col), value);
}

// Specialization of StoreFinalOutput for NEONFragmentInt32x4x1, storing into
// an int32 destination.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentInt32x4x1 value, DstType* dst, int row,
                             int col) {
  vst1q_s32(dst->data(row, col), value);
}

// Specialization of StoreFinalOutput for NEONFragmentInt32x16x1, storing into
// an int32 destination.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentInt32x16x1 value, DstType* dst,
                             int row, int col) {
  for (int i = 0; i < 4; i++) {
    vst1q_s32(dst->data(row + 4 * i, col), value.data.val[i]);
  }
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_OUTPUT_NEON_H_