// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output.h: processing the 32-bit accumulators output by the unpack
// stage, obtaining the final result matrix entries and storing them into
// the destination matrix.

#ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
#define GEMMLOWP_INTERNAL_OUTPUT_H_

#include <algorithm>  // for std::min / std::max, used by OutputStageClamp
#include <cmath>
#include <tuple>
#include <type_traits>

#include "../public/output_stages.h"
#include "fixedpoint.h"

namespace gemmlowp {

// A Fragment is a small fixed-size matrix typically stored in one or
// a few architecture-specific SIMD vectors. Besides plain old scalar types
// such as int32_t, Fragment types are what can be used as input/output data
// types for output pipeline stages.
//
// More details:
//
// In the generic scalar code in this file, we have only implemented
// evaluation of output stages for scalar inputs (e.g. plain int32_t values).
// Other files (e.g. output_neon.h) provide SIMD paths by implementing
// evaluation of output stages for SIMD vector types. However, this raises
// the question of how the different values ("lanes") in a SIMD vector
// correspond to different values in the whole matrices. For simple
// entry-wise output stages, this doesn't matter, but for output stages that
// depend on position within the whole matrix, it does. To solve this
// problem, rather than implementing evaluation of output stages for raw
// SIMD vector types, we wrap SIMD vector types in "fragment" structs that
// bring the additional structure of "shape", i.e. a mapping from SIMD lanes
// to matrix entries, and we specialize evaluation of output stages for such
// fragment types. The Fragment template struct here is how we generate
// all fragment structs. For example, in output_neon.h, it may be specialized
// with DataType=int32x4_t, Rows=4, Cols=1. MapOrder doesn't matter for
// vector shapes. While Fragment is only used for SIMD paths, we keep it
// in this platform-generic file because this same template should
// cover the needs of any SIMD architecture.
template <typename tDataType, int tRows, int tCols, MapOrder tOrder>
struct Fragment {
  typedef tDataType DataType;
  static const int kRows = tRows;
  static const int kCols = tCols;
  static const MapOrder kOrder = tOrder;

  Fragment() {}
  Fragment(const DataType& d) : data(d) {}
  operator DataType() const { return data; }

  DataType data;
};

typedef Fragment<std::int32_t, 1, 1, MapOrder::ColMajor> FragmentInt32x1x1;
typedef Fragment<std::uint8_t, 1, 1, MapOrder::ColMajor> FragmentUint8x1x1;
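
// For illustration, a scalar fragment wraps a single value and converts
// implicitly to and from it (a minimal usage sketch, not part of the
// library API):
//
//   FragmentInt32x1x1 f(42);  // wrap a single accumulator value
//   std::int32_t x = f;       // unwrap via operator DataType()
//
// A SIMD back-end would instantiate the same template with a vector type,
// e.g. a hypothetical Fragment<int32x4_t, 4, 1, MapOrder::ColMajor>
// holding 4 consecutive rows of one column.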

// OutputStageEvalImpl is the template that we specialize to provide
// implementations of each output stage for each type of input data.
//
// Each specialization provides an OutputType typedef and an Eval function
// returning OutputType. The OutputType typically depends on the InputType.
//
// There are two dimensions in which input data types can vary:
//   1. Different output stages may expect different data types. The
//      only hard constraint is that the first stage accepts int32, as
//      the unpack stage produces int32 accumulators.
//   2. For a given scalar data type such as int32, there is still the
//      possibility of having SIMD vector types such as NEON int32x4_t,
//      typically wrapped as "fragment" types, see struct Fragment.
//      Thus, there can be several OutputStageEvalImpl
//      specializations for a single OutputStageType, for different
//      InputTypes.
template <typename OutputStageType, typename InputType>
struct OutputStageEvalImpl {
  // This generic template body should never be hit.
  static_assert(
      std::is_same<InputType, void>::value,
      "Unimplemented: missing implementation of this output pipeline stage "
      "for this data type. This would happen if some architecture-specific "
      "SIMD back-end (output_$arch.h) were incomplete.");

  OutputStageEvalImpl(const OutputStageType&) {}
};

// Implementation of OutputStageQuantizeDownInt32ToUint8Scale for scalar data
template <>
struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale,
                           FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int;
    const std::int32_t result_offset = output_stage.result_offset;
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
           result_shift;
  }

  const OutputStage& output_stage;
};
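
// A worked example of the quantize-down arithmetic above, with hypothetical
// stage parameters (not taken from any real quantization scheme):
//
//   input = 100, result_offset = 10, result_mult_int = 3, result_shift = 4
//   kRoundingTerm = 1 << 3 = 8
//   ((100 + 10) * 3 + 8) >> 4 = 338 >> 4 = 21
//
// The rounding term makes the final shift round-to-nearest instead of
// round-toward-negative-infinity: without it, 330 >> 4 would give 20, while
// 330 / 16 = 20.625 rounds to 21.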

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC with
// column-shaped (per-row) parameter vectors, for scalar data
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>,
    FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int(row);
    const std::int32_t result_offset = output_stage.result_offset(row);
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
           result_shift;
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC with
// row-shaped (per-column) parameter vectors, for scalar data
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>,
    FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int(col);
    const std::int32_t result_offset = output_stage.result_offset(col);
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
           result_shift;
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageSaturatingCastToUint8 for scalar data
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentUint8x1x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  OutputType Eval(InputType input, int, int) const {
    std::int32_t data = input.data;
    return data > 255 ? 255 : data < 0 ? 0 : data;
  }
};
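
// Example of the saturating cast above: 300 saturates to 255, -5 saturates
// to 0, and any value already in [0, 255], such as 128, passes through
// unchanged.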

// Implementation of OutputStageBiasAddition for scalar data
template <typename VectorType>
struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
                           FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageBiasAddition<VectorType> OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    if (VectorType::kShape == VectorShape::Row) {
      return input + output_stage.bias_vector(col);
    } else {
      return input + output_stage.bias_vector(row);
    }
  }

  const OutputStage& output_stage;
};
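
// In other words, a row-shaped bias vector adds the same bias to every
// entry of a given column (indexed by col), while a column-shaped bias
// vector adds the same bias to every entry of a given row (indexed by row).
// The latter is the typical way to add a per-output-channel bias after a
// matrix multiplication.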

// Implementation of OutputStageClamp for scalar data
template <>
struct OutputStageEvalImpl<OutputStageClamp, FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageClamp OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t min = output_stage.min;
    const std::int32_t max = output_stage.max;
    return std::min(std::max(input.data, min), max);
  }

  const OutputStage& output_stage;
};
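
// Example: with min = -100 and max = 100, inputs -250, 30 and 4000 clamp to
// -100, 30 and 100 respectively.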

// Implementation of OutputStageTanh for either scalar or SIMD data
template <typename tInputType>
struct OutputStageTanhEvalImpl {
  typedef tInputType InputType;
  typedef InputType OutputType;
  typedef typename InputType::DataType DataType;
  typedef OutputStageTanh OutputStage;

  OutputStageTanhEvalImpl(const OutputStage& s) : output_stage(s) {
    const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
    const std::int32_t real_amplitude_as_int32 =
        output_stage.real_amplitude_as_int32;

    input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32;
    input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32;
    output_min = real_zero_as_int32 - real_amplitude_as_int32;
    output_max = real_zero_as_int32 + real_amplitude_as_int32;

    double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32;
    inverse_amplitude_neg_exponent = 0;
    while (inverse_amplitude_normalized_double < 0.5) {
      inverse_amplitude_normalized_double *= 2;
      inverse_amplitude_neg_exponent++;
    }
    inverse_amplitude_normalized =
        ToFixedPoint<DataType, 0>(inverse_amplitude_normalized_double);

    double amplitude_normalized_double = real_amplitude_as_int32;
    amplitude_exponent = 0;
    while (amplitude_normalized_double >= 1.0) {
      amplitude_normalized_double *= 0.5;
      amplitude_exponent++;
    }
    amplitude_normalized =
        ToFixedPoint<DataType, 0>(amplitude_normalized_double);
  }

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;

    typedef FixedPoint<DataType, 3> F3;
    typedef FixedPoint<DataType, 0> F0;

    // fixed-point affine transformation
    DataType input_centered =
        Sub(input.data, Dup<DataType>(real_zero_as_int32));
    F3 fixedpoint_input =
        F3::FromRaw(input_centered) * inverse_amplitude_normalized;
    // left shift
    fixedpoint_input.raw() =
        ShiftLeft(fixedpoint_input.raw(), 28 - inverse_amplitude_neg_exponent);
    // fixed-point tanh and multiplication
    F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized;
    // right shift
    DataType int32_output =
        Add(Dup<DataType>(real_zero_as_int32),
            ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent));

    DataType mask_if_below_cutoff_min =
        MaskIfLessThanOrEqual(input.data, Dup<DataType>(input_cutoff_min));
    DataType mask_if_above_cutoff_max =
        MaskIfGreaterThanOrEqual(input.data, Dup<DataType>(input_cutoff_max));

    return SelectUsingMask(
        mask_if_below_cutoff_min, Dup<DataType>(output_min),
        SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max),
                        int32_output));
  }

  const OutputStage& output_stage;
  std::int32_t input_cutoff_min, input_cutoff_max;
  std::int32_t output_min, output_max;
  FixedPoint<DataType, 0> inverse_amplitude_normalized;
  int inverse_amplitude_neg_exponent;
  FixedPoint<DataType, 0> amplitude_normalized;
  int amplitude_exponent;
};
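
// In real-number terms, the stage above computes approximately
//
//   output = real_zero_as_int32
//            + real_amplitude_as_int32
//              * tanh((input - real_zero_as_int32) / real_amplitude_as_int32)
//
// using only fixed-point arithmetic. Inputs more than 8 amplitudes away
// from the zero point are clamped directly to output_min/output_max, since
// tanh is flat there to within int32 precision (tanh(8) differs from 1 by
// less than 1e-6).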

template <>
struct OutputStageEvalImpl<OutputStageTanh, FragmentInt32x1x1>
    : OutputStageTanhEvalImpl<FragmentInt32x1x1> {
  OutputStageEvalImpl(const OutputStageTanh& output_stage)
      : OutputStageTanhEvalImpl(output_stage) {}
};

// OutputPipelineOutputType is a helper to determine the output data type of
// a pipeline, for a given input data type. It is a recursive template; see
// the explanation on OutputPipelineEvalImpl below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
          bool StopRecursion =
              FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineOutputType {
  typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
      FirstStageType;
  typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
      FirstStageOutputType;
  typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1,
                                            FirstStageOutputType>::Type Type;
};

template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType,
                                true> {
  typedef InputType Type;
};
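
// For example, for a two-stage pipeline
//
//   std::tuple<OutputStageQuantizeDownInt32ToUint8Scale,
//              OutputStageSaturatingCastToUint8>
//
// starting from FragmentInt32x1x1, the first stage outputs
// FragmentInt32x1x1 and the second outputs FragmentUint8x1x1, so the
// recursion terminates with Type = FragmentUint8x1x1.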

// OutputPipelineEvalImpl is a helper to implement the evaluation of
// the whole pipeline. It is a recursive template to implement compile-time
// unrolling of the loop over all pipeline stages. The 'FirstStage' parameter
// is how we implement recursion: each specialization implements only
// evaluation starting at 'FirstStage'. The StopRecursion parameter is just a
// helper to implement the termination of the recursion as a partial
// specialization below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
          bool StopRecursion =
              FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineEvalImpl {
  typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
      FirstStageType;
  typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
      FirstStageOutputType;
  typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage,
                                            InputType>::Type OutputType;

  OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline)
      : head_impl(std::get<FirstStage>(output_pipeline)),
        tail_impl(output_pipeline) {}

  OutputType Eval(InputType input, int row, int col) const {
    // Evaluate the first stage.
    FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col);
    // Recurse into the remaining stages.
    return tail_impl.Eval(first_stage_output, row, col);
  }

  const OutputStageEvalImpl<FirstStageType, InputType> head_impl;
  const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1,
                               FirstStageOutputType>
      tail_impl;
};

// Specialization on 'StopRecursion' for terminating the recursion.
template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> {
  OutputPipelineEvalImpl(const OutputPipelineType&) {}

  InputType Eval(InputType input, int, int) const {
    // Terminating the recursion.
    return input;
  }
};

// StoreFinalOutput takes the final value at the end of the output pipeline
// and stores it into the destination matrix. It can be specialized for
// different data types; the generic implementation here is typically used
// only for plain old scalar (not SIMD) types.
template <typename OutputType, typename DstType>
void StoreFinalOutput(OutputType value, DstType* dst, int row, int col) {
  *dst->data(row, col) = value;
}
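
// A SIMD back-end would overload StoreFinalOutput for its fragment types to
// store all lanes at once. A minimal NEON-flavored sketch (hypothetical;
// the real overloads live in the architecture-specific output_$arch.h
// files):
//
//   template <typename DstType>
//   void StoreFinalOutput(
//       Fragment<int32x4_t, 4, 1, MapOrder::ColMajor> value, DstType* dst,
//       int row, int col) {
//     // Store 4 consecutive rows of one column with a single vector store.
//     vst1q_s32(dst->data(row, col), value.data);
//   }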

template <typename OutputPipelineType, typename InputType>
struct OutputPipelineExecutor {
  OutputPipelineExecutor(const OutputPipelineType& output_pipeline)
      : output_pipeline_eval_impl_(output_pipeline) {}

  // Execute is the entry point into the output pipeline evaluation code. It
  // should be the only thing that unpack code calls. It takes the result of
  // the unpack stage and stores it into the destination matrix.
  template <typename DstType>
  void Execute(InputType input, DstType* dst, int row, int col) {
    // Statically assert that the output pipeline matches the given
    // destination matrix's scalar type.
    typedef typename OutputPipelineOutputType<OutputPipelineType, 0,
                                              FragmentInt32x1x1>::Type::DataType
        ScalarOutputType;
    typedef typename DstType::Scalar ScalarDstType;
    static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value,
                  "mismatched destination scalar type and output pipeline");

    // Evaluate the output pipeline.
    auto output = output_pipeline_eval_impl_.Eval(input, row, col);
    // Store the result into the destination matrix.
    StoreFinalOutput(output, dst, row, col);
  }

  const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType>
      output_pipeline_eval_impl_;
};
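
// Sketch of how unpack code might drive the executor (hypothetical names
// and values, for illustration only):
//
//   std::tuple<OutputStageQuantizeDownInt32ToUint8Scale,
//              OutputStageSaturatingCastToUint8> pipeline;
//   OutputPipelineExecutor<decltype(pipeline), FragmentInt32x1x1>
//       executor(pipeline);
//   // For each int32 accumulator produced by the unpack stage:
//   executor.Execute(FragmentInt32x1x1(accum), &dst_map, row, col);
//
// where dst_map is a MatrixMap over uint8_t destination storage, matching
// the pipeline's final scalar type as checked by the static_assert above.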

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_OUTPUT_H_