// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output_stages.h: public definitions of the output stages that can
// be assembled into an output pipeline, to control how internal
// 32-bit accumulators are transformed to obtain the final uint8
// result matrix entries.

#ifndef GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_
#define GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_

#include <tuple>

#include "../internal/common.h"

namespace gemmlowp {

// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to the uint8 scale; in other words, its output
// is typically what one would then clamp to [0..255] and cast to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_offset, result_mult_int, result_shift,
// and the result is:
//   ((input + result_offset) * result_mult_int + rounding) >> result_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
struct OutputStageQuantizeDownInt32ToUint8Scale {
  std::int32_t result_offset;
  std::int32_t result_mult_int;
  std::int32_t result_shift;
};
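
// For reference, a minimal scalar sketch of the formula above (for
// illustration only; not gemmlowp's actual optimized implementation,
// and the function name is made up for this sketch):
//
//   std::int32_t EvalQuantizeDownInt32ToUint8Scale(
//       std::int32_t input,
//       const OutputStageQuantizeDownInt32ToUint8Scale& stage) {
//     const std::int32_t rounding =
//         (stage.result_shift < 1) ? 0 : (1 << (stage.result_shift - 1));
//     return ((input + stage.result_offset) * stage.result_mult_int +
//             rounding) >> stage.result_shift;
//   }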

// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to the uint8 scale; in other words, its output
// is typically what one would then clamp to [0..255] and cast to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_offset, result_mult_int, result_shift,
// and the result is:
//   ((input + result_offset) * result_mult_int + rounding) >> result_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
//
// The difference from OutputStageQuantizeDownInt32ToUint8Scale is that each
// row or column of the output (depending on tShape) has its own
// result_offset and result_mult_int values.
template <VectorShape tShape>
struct OutputStageQuantizeDownInt32ToUint8ScalePC {
  VectorMap<const std::int32_t, tShape> result_offset;
  VectorMap<const std::int32_t, tShape> result_mult_int;
  std::int32_t result_shift;
};
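
// Example (a sketch): per-column parameters for a result matrix with 3
// columns, supplied as row-vectors of length 3. VectorMap (see map.h)
// wraps an existing buffer of the given size; the numeric values here
// are purely illustrative:
//
//   const std::int32_t offsets[] = {10, 20, 30};
//   const std::int32_t multipliers[] = {100, 200, 300};
//   OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row> stage;
//   stage.result_offset =
//       VectorMap<const std::int32_t, VectorShape::Row>(offsets, 3);
//   stage.result_mult_int =
//       VectorMap<const std::int32_t, VectorShape::Row>(multipliers, 3);
//   stage.result_shift = 8;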

// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to a different scale; for example, in a pipeline
// that outputs uint8 values in [0..255], the output of this stage would be
// int32 values ready to be clamped to [0..255] and cast to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
// and the result is:
//   ((FixedPointMul(input, result_fixedpoint_multiplier) +
//   rounding) >> result_shift) + result_offset_after_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
// and where FixedPointMul(x, y) is the nearest integer to the following
// mathematical expression, evaluated without overflow or intermediate
// rounding:
//   (x * y) / 2^31
// In practice, it is expected that FixedPointMul will be implemented
// using hardware "rounding doubling int32 multiply high" instructions,
// such as VQRDMULH on ARM. See in fixedpoint.h the generic function,
// SaturatingRoundingDoublingHighMul.
//
// Notice that the other difference from
// OutputStageQuantizeDownInt32ToUint8Scale is that the result offset
// is applied after the multiplier and shift, not before. This ensures
// that no matter what the multiplier and shift are, the result offset
// is effectively integral: offsetting the final result by an integer.
// The motivation for this is to faithfully support quantization schemes
// where the formula linking quantized values to the real mathematical
// values that they represent is of the form
//
//   real_value = scale * (quantized_value - zero_point)
//
// where scale is a real number (represented in quantized form by
// result_fixedpoint_multiplier and result_shift) and zero_point
// is an integer telling which quantized value corresponds to the
// real value 0, and is represented here by (the opposite of)
// result_offset_after_shift.
// The motivation for such a quantization scheme, designed to
// ensure that 0 is always a representable value, is that in
// many applications, we need to 0-pad arrays and that can only be
// done for quantized arrays if 0 is a representable value in
// quantized form. In particular, convolution-like operations
// are often implemented using 0-padding, or "im2col"-like
// expansions that implicitly rely on 0-padding. If 0 were not
// a representable value, such operations would have to pad
// using a nonzero value, introducing bias in the computation.
struct OutputStageQuantizeDownInt32ByFixedPoint {
  std::int32_t result_fixedpoint_multiplier;
  std::int32_t result_shift;
  std::int32_t result_offset_after_shift;
};
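
// For reference, the generic SaturatingRoundingDoublingHighMul mentioned
// above can be written in portable C++ as follows (a sketch along the
// lines of fixedpoint.h; uses std::numeric_limits from <limits>):
//
//   std::int32_t SaturatingRoundingDoublingHighMul(std::int32_t a,
//                                                  std::int32_t b) {
//     // The only overflow case: both operands are INT32_MIN, whose
//     // doubled product does not fit in int32; saturate to INT32_MAX.
//     bool overflow = a == b && a == std::numeric_limits<std::int32_t>::min();
//     std::int64_t ab_64 = static_cast<std::int64_t>(a) * b;
//     // Round-to-nearest nudge before taking the high 32 bits of 2*a*b.
//     std::int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
//     std::int32_t ab_x2_high32 =
//         static_cast<std::int32_t>((ab_64 + nudge) / (1ll << 31));
//     return overflow ? std::numeric_limits<std::int32_t>::max()
//                     : ab_x2_high32;
//   }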

// OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint is the old deprecated
// name of OutputStageQuantizeDownInt32ByFixedPoint, before we noticed that
// there really wasn't anything Uint8-specific about it.
using OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint =
    OutputStageQuantizeDownInt32ByFixedPoint;

// Variant of OutputStageQuantizeDownInt32ByFixedPoint where the 'shift'
// is not necessarily just a right shift, so we can represent multipliers
// greater than 1. This takes a result_exponent parameter; when it's
// <= 0, this is equivalent to OutputStageQuantizeDownInt32ByFixedPoint
// with result_shift = -result_exponent.
// In the general case, this consists of first left-shifting by
// std::max(result_exponent, 0), before doing the same as
// OutputStageQuantizeDownInt32ByFixedPoint with
// result_shift = std::max(-result_exponent, 0).
struct OutputStageScaleInt32ByFixedPointAndExponent {
  std::int32_t result_fixedpoint_multiplier;
  std::int32_t result_exponent;
  std::int32_t result_offset_after_shift;
};
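
// For example, a scalar sketch of the decomposition just described
// (illustrative only), where 'stage' is an
// OutputStageScaleInt32ByFixedPointAndExponent and FixedPointMul is as
// described above:
//
//   const int left_shift = std::max(stage.result_exponent, 0);
//   const int right_shift = std::max(-stage.result_exponent, 0);
//   std::int32_t x = input << left_shift;
//   x = FixedPointMul(x, stage.result_fixedpoint_multiplier);
//   const std::int32_t rounding =
//       (right_shift < 1) ? 0 : (1 << (right_shift - 1));
//   const std::int32_t output =
//       ((x + rounding) >> right_shift) + stage.result_offset_after_shift;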

// This output stage takes int32 values that are expected to be already
// on the final uint8 scale, but not necessarily in the [0..255] range.
// It clamps them to the [0..255] range and returns them cast to uint8.
struct OutputStageSaturatingCastToUint8 {};

// This output stage takes int32 values that are expected to be already
// on the final int16 scale, but not necessarily in the [-32768..32767] range.
// It clamps them to the [-32768..32767] range and returns them cast to int16.
struct OutputStageSaturatingCastToInt16 {};

// This output stage depends on a "bias vector" that should contain int32
// entries, and be either a row-vector of the same number of columns as the
// result matrix, or a column-vector of the same number of rows as the
// result matrix. This output stage takes int32 values and adds to them
// the corresponding entry of the bias vector (broadcasted in the other
// direction to fit the matrix's shape), outputting int32 values.
template <typename VectorType>
struct OutputStageBiasAddition {
  VectorType bias_vector;
};

// This output stage clamps values between the specified min and max bounds.
// It can be used to implement "rectified linear unit" activation functions
// in neural networks.
struct OutputStageClamp {
  std::int32_t min;
  std::int32_t max;
};
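
// For example, under a quantization scheme where the real value 0 is
// represented by a quantized value zero_point (a hypothetical variable
// here), a ReLU activation on the uint8 scale could be expressed as:
//
//   OutputStageClamp relu_stage;
//   relu_stage.min = zero_point;  // quantized representation of real 0
//   relu_stage.max = 255;         // top of the uint8 range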

// This output stage applies the tanh function. The interpretation of the
// int32 input and output values in terms of real values is given by two
// parameters: real_zero_as_int32 is the int32 value representing the real
// value 0, and real_amplitude_as_int32 is the int32 amplitude corresponding
// to a real amplitude of 1, so that tanh's output range [-1, 1] maps to
// [real_zero_as_int32 - real_amplitude_as_int32,
//  real_zero_as_int32 + real_amplitude_as_int32].
struct OutputStageTanh {
  std::int32_t real_zero_as_int32;
  std::int32_t real_amplitude_as_int32;
};

// An output pipeline is just a std::tuple of output stages.
// This function generates a standard output pipeline consisting of two stages:
// OutputStageQuantizeDownInt32ToUint8Scale, OutputStageSaturatingCastToUint8.
inline std::tuple<OutputStageQuantizeDownInt32ToUint8Scale,
                  OutputStageSaturatingCastToUint8>
MakeStandardOutputPipeline(std::int32_t result_offset,
                           std::int32_t result_mult_int,
                           std::int32_t result_shift) {
  OutputStageQuantizeDownInt32ToUint8Scale quantize_down_stage;
  quantize_down_stage.result_offset = result_offset;
  quantize_down_stage.result_mult_int = result_mult_int;
  quantize_down_stage.result_shift = result_shift;
  OutputStageSaturatingCastToUint8 saturating_cast_stage;
  return std::make_tuple(quantize_down_stage, saturating_cast_stage);
}
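
// Example usage (a sketch): the returned tuple is passed as the
// output_pipeline argument of GemmWithOutputPipeline, declared in
// public/gemmlowp.h. Here, context, lhs, rhs, result and the offsets
// are assumed to have been set up elsewhere:
//
//   auto pipeline = MakeStandardOutputPipeline(result_offset,
//                                              result_mult_int,
//                                              result_shift);
//   GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
//                          DefaultL8R8BitDepthParams>(
//       &context, lhs, rhs, &result, lhs_offset, rhs_offset, pipeline);
//
// Pipelines need not come from this helper: any std::tuple of output
// stages is accepted, e.g. std::make_tuple(bias_addition_stage,
// quantize_down_stage, clamp_stage, saturating_cast_stage).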

// An output pipeline is just a std::tuple of output stages.
// This function generates a standard output pipeline consisting of two stages:
// OutputStageQuantizeDownInt32ToUint8ScalePC, OutputStageSaturatingCastToUint8.
template <VectorShape tShape>
inline std::tuple<OutputStageQuantizeDownInt32ToUint8ScalePC<tShape>,
                  OutputStageSaturatingCastToUint8>
MakeStandardOutputPipeline(
    const VectorMap<const std::int32_t, tShape>& result_offset,
    const VectorMap<const std::int32_t, tShape>& result_mult_int,
    std::int32_t result_shift) {
  OutputStageQuantizeDownInt32ToUint8ScalePC<tShape> quantize_down_stage;
  quantize_down_stage.result_offset = result_offset;
  quantize_down_stage.result_mult_int = result_mult_int;
  quantize_down_stage.result_shift = result_shift;
  OutputStageSaturatingCastToUint8 saturating_cast_stage;
  return std::make_tuple(quantize_down_stage, saturating_cast_stage);
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_