1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // output_stages.h: public definitions of the output stages that can 16 // be assembled into an output pipeline, to control how internal 17 // 32-bit accumulators are transformed to obtain the final uint8 18 // result matrix entries. 19 20 #ifndef GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_ 21 #define GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_ 22 23 #include <tuple> 24 25 #include "../internal/common.h" 26 27 namespace gemmlowp { 28 29 // This output stage takes int32 values and returns still int32 values, 30 // but "quantized down" to the uint8 scale; in other words, its output 31 // is typically what one would then clamp to [0..255] and cast to uint8 32 // (see OutputStageSaturatingCastToUint8). 33 // 34 // This "quantization down" process depends on 3 parameters, 35 // result_offset, result_mult_int, result_shift, 36 // and the result is: 37 // ((input + result_offset) * result_mult_int + rounding) >> result_shift 38 // where 39 // rounding = (result_shift < 1) ? 
0 : (1 << (result_shift - 1)); 40 struct OutputStageQuantizeDownInt32ToUint8Scale { 41 std::int32_t result_offset; 42 std::int32_t result_mult_int; 43 std::int32_t result_shift; 44 }; 45 46 // This output stage takes int32 values and returns still int32 values, 47 // but "quantized down" to the uint8 scale; in other words, its output 48 // is typically what one would then clamp to [0..255] and cast to uint8 49 // (see OutputStageSaturatingCastToUint8). 50 // 51 // This "quantization down" process depends on 3 parameters, 52 // result_offset, result_mult_int, result_shift, 53 // and the result is: 54 // ((input + result_offset) * result_mult_int + rounding) >> result_shift 55 // where 56 // rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1)); 57 // 58 // Difference from OutputStageQuantizeDownInt32ToUint8Scale here is that each 59 // row or column of the output (depending on tShape) has its own result_offset 60 // and result_mult_int numbers. 61 template <VectorShape tShape> 62 struct OutputStageQuantizeDownInt32ToUint8ScalePC { 63 VectorMap<const std::int32_t, tShape> result_offset; 64 VectorMap<const std::int32_t, tShape> result_mult_int; 65 std::int32_t result_shift; 66 }; 67 68 // This output stage takes int32 values and returns still int32 values, 69 // but "quantized down" to a difference scale; for example, in a pipeline 70 // that outputs uint8 values in [0..255], the output of this stage would be 71 // int32 values ready to be clamped to [0..255] and casted to uint8 72 // (see OutputStageSaturatingCastToUint8). 73 // 74 // This "quantization down" process depends on 3 parameters, 75 // result_offset, result_fixedpoint_multiplier, result_shift, 76 // and the result is: 77 // ((FixedPointMul(input, result_fixedpoint_multiplier) + 78 // rounding) >> result_shift) + result_offset_after_shift 79 // where 80 // rounding = (result_shift < 1) ? 
//       0 : (1 << (result_shift - 1));
// and where FixedPointMul(x, y) is the nearest integer to the following
// mathematical expression, evaluated without overflow or intermediate
// rounding:
//   (x * y) / 2^31
// In practice, it is expected that FixedPointMul will be implemented
// using hardware "rounding doubling int32 multiply high" instructions,
// such as VQRDMULH on ARM. See in fixedpoint.h the generic function,
// SaturatingRoundingDoublingHighMul.
//
// Notice that the other difference from
// OutputStageQuantizeDownInt32ToUint8Scale is that the result offset
// is applied after the multiplier and shift, not before. This ensures
// that no matter what the multiplier and shift are, the result offset
// is effectively integral: offsetting the final result by an integer.
// The motivation for this is to faithfully support quantization schemes
// where the formula linking quantized values to the real mathematical
// values that they represent, is of the form
//
//   real_value = scale * (quantized_value - zero_point)
//
// where scale is a real number (represented in quantized form by
// result_fixedpoint_multiplier and result_shift) and zero_point
// is an integer telling which quantized value corresponds to the
// real value 0, and is represented here by (the opposite of)
// result_offset_after_shift.
// The motivation for such a quantization scheme, designed to
// ensure that 0 is always a representable value, is that in
// many applications, we need to 0-pad arrays and that can only be
// done for quantized arrays if 0 is a representable value in
// quantized form. In particular, convolution-like operations
// are often implemented using 0-padding, or "im2col"-like
// expansions that implicitly rely on 0-padding. If 0 were not
// a representable value, such operations would have to pad
// using a nonzero value, introducing bias in the computation.
struct OutputStageQuantizeDownInt32ByFixedPoint {
  // Fixed-point multiplier y in FixedPointMul(x, y), i.e. the nearest
  // integer to (x * y) / 2^31.
  std::int32_t result_fixedpoint_multiplier;
  // Right-shift amount applied (with rounding) after the fixed-point
  // multiplication.
  std::int32_t result_shift;
  // Integer offset added last, after the multiplier and the shift.
  std::int32_t result_offset_after_shift;
};

// OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint is the old deprecated
// name of OutputStageQuantizeDownInt32ByFixedPoint, before we noticed that
// there really wasn't anything Uint8-specific about it.
using OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint =
    OutputStageQuantizeDownInt32ByFixedPoint;

// Variant of OutputStageQuantizeDownInt32ByFixedPoint where the 'shift'
// is not necessarily just a right shift, so we can represent multipliers
// greater than 1. This takes a result_exponent parameter; when it's
// <= 0, this is equivalent to OutputStageQuantizeDownInt32ByFixedPoint
// with result_shift = -result_exponent.
// In the general case, this consists in first left-shifting by
// std::max(result_exponent, 0), before doing the same as
// OutputStageQuantizeDownInt32ByFixedPoint with
// result_shift = std::max(-result_exponent, 0).
struct OutputStageScaleInt32ByFixedPointAndExponent {
  // Fixed-point multiplier, as in OutputStageQuantizeDownInt32ByFixedPoint.
  std::int32_t result_fixedpoint_multiplier;
  // Positive: left shift applied first; negative or zero: right shift of
  // -result_exponent applied after the multiplication.
  std::int32_t result_exponent;
  // Integer offset added last.
  std::int32_t result_offset_after_shift;
};

// This output stage takes int32 values that are expected to be already
// on the final uint8 scale, but not necessarily in the [0..255] range.
// It clamps them to the [0..255] range and returns them cast to uint8.
struct OutputStageSaturatingCastToUint8 {};

// This output stage takes int32 values that are expected to be already
// on the final int16 scale, but not necessarily in the [-32768..32767] range.
// It clamps them to the [-32768..32767] range and returns them cast to int16.
149 struct OutputStageSaturatingCastToInt16 {}; 150 151 // This output stage depends on a "bias vector" that should contain int32 152 // entries, and be either a row-vector of the same number of columns as the 153 // result matrix, or a column-vector of the same number of rows as the 154 // result matrix. This output stage takes int32 values and adds to them 155 // the corresponding entry of the bias vector (broadcasted in the other 156 // direction to fit the matrix's shape), outputting int32 values. 157 template <typename VectorType> 158 struct OutputStageBiasAddition { 159 VectorType bias_vector; 160 }; 161 162 // This output stage clamps value between the specified min and max bounds. 163 // It can be used to implement "rectified linear unit" activation functions 164 // in neural networks. 165 struct OutputStageClamp { 166 std::int32_t min; 167 std::int32_t max; 168 }; 169 170 struct OutputStageTanh { 171 std::int32_t real_zero_as_int32; 172 std::int32_t real_amplitude_as_int32; 173 }; 174 175 // An output pipeline is just a std::tuple of output stages. 176 // This function generates a standard output pipeline consisting of two stages: 177 // OutputStageQuantizeDownInt32ToUint8Scale, OutputStageSaturatingCastToUint8. 178 inline std::tuple<OutputStageQuantizeDownInt32ToUint8Scale, 179 OutputStageSaturatingCastToUint8> 180 MakeStandardOutputPipeline(std::int32_t result_offset, 181 std::int32_t result_mult_int, 182 std::int32_t result_shift) { 183 OutputStageQuantizeDownInt32ToUint8Scale quantize_down_stage; 184 quantize_down_stage.result_offset = result_offset; 185 quantize_down_stage.result_mult_int = result_mult_int; 186 quantize_down_stage.result_shift = result_shift; 187 OutputStageSaturatingCastToUint8 saturating_cast_stage; 188 return std::make_tuple(quantize_down_stage, saturating_cast_stage); 189 } 190 191 // An output pipeline is just a std::tuple of output stages. 
192 // This function generates a standard output pipeline consisting of two stages: 193 // OutputStageQuantizeDownInt32ToUint8ScalePC, OutputStageSaturatingCastToUint8. 194 template <VectorShape tShape> 195 inline std::tuple<OutputStageQuantizeDownInt32ToUint8ScalePC<tShape>, 196 OutputStageSaturatingCastToUint8> 197 MakeStandardOutputPipeline( 198 const VectorMap<const std::int32_t, tShape>& result_offset, 199 const VectorMap<const std::int32_t, tShape>& result_mult_int, 200 std::int32_t result_shift) { 201 OutputStageQuantizeDownInt32ToUint8ScalePC<tShape> quantize_down_stage; 202 quantize_down_stage.result_offset = result_offset; 203 quantize_down_stage.result_mult_int = result_mult_int; 204 quantize_down_stage.result_shift = result_shift; 205 OutputStageSaturatingCastToUint8 saturating_cast_stage; 206 return std::make_tuple(quantize_down_stage, saturating_cast_stage); 207 } 208 209 } // namespace gemmlowp 210 211 #endif // GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_ 212