gemmlowp/public/output_stages.h

// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output_stages.h: public definitions of the output stages that can
// be assembled into an output pipeline, to control how internal
// 32-bit accumulators are transformed to obtain the final uint8
// result matrix entries.

#ifndef GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_
#define GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_

#include <tuple>

#include "../internal/common.h"

namespace gemmlowp {

// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to the uint8 scale; in other words, its output
// is typically what one would then clamp to [0..255] and cast to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_offset, result_mult_int, result_shift,
// and the result is:
//   ((input + result_offset) * result_mult_int + rounding) >> result_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
struct OutputStageQuantizeDownInt32ToUint8Scale {
  std::int32_t result_offset;
  std::int32_t result_mult_int;
  std::int32_t result_shift;
};

// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to the uint8 scale; in other words, its output
// is typically what one would then clamp to [0..255] and cast to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_offset, result_mult_int, result_shift,
// and the result is:
//   ((input + result_offset) * result_mult_int + rounding) >> result_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
//
// Difference from OutputStageQuantizeDownInt32ToUint8Scale here is that each
// row or column of the output (depending on tShape) has its own result_offset
// and result_mult_int numbers.
template <VectorShape tShape>
struct OutputStageQuantizeDownInt32ToUint8ScalePC {
  VectorMap<const std::int32_t, tShape> result_offset;
  VectorMap<const std::int32_t, tShape> result_mult_int;
  std::int32_t result_shift;
};

// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to a difference scale; for example, in a pipeline
// that outputs uint8 values in [0..255], the output of this stage would be
// int32 values ready to be clamped to [0..255] and casted to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_offset, result_fixedpoint_multiplier, result_shift,
// and the result is:
//   ((FixedPointMul(input, result_fixedpoint_multiplier) +
//   rounding) >> result_shift) + result_offset_after_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
// and where FixedPointMul(x, y) is the nearest integer to the following
// mathematical expression, evaluated without overflow or intermediate
// rounding:
//   (x * y) / 2^31
// In practice, it is expected that FixedPointMul will be implemented
// using hardware "rounding doubling int32 multiply high" instructions,
// such as VQRDMULH on ARM. See in fixedpoint.h the generic function,
// SaturatingRoundingDoublingHighMul.
//
// Notice that the other difference from
// OutputStageQuantizeDownInt32ToUint8Scale is that the result offset
// is applied after the multiplier and shift, not before. This ensures
// that no matter what the multiplier and shift are, the result offset
// is effectively integral: offsetting the final result by an integer.
// The motivation for this is to faithfully support quantization schemes
// where the formula linking quantized values to the real mathematical
// values that they represent, is of the form
//
//   real_value = scale * (quantized_value - zero_point)
//
// where scale is a real number (represented in quantized form by
// result_fixedpoint_multiplier and result_shift) and zero_point
// is an integer telling which quantized value correspond to the
// real value 0, and is represented here by (the opposite of)
// result_offset_after_shift.
// The motivation for such a quantization scheme, designed to
// ensure that 0 is always a representable value, is that in
// many applications, we need to 0-pad arrays and that can only be
// done for quantized arrays if 0 is a representable value in
// quantized form. In particular, convolution-like operations
// are often implemented using 0-padding, or "im2col"-like
// expansions that implicitly rely on 0-padding. If 0 were not
// a representable value, such operations would have to pad
// using a nonzero value, introducing bias in the computation.
struct OutputStageQuantizeDownInt32ByFixedPoint {
  std::int32_t result_fixedpoint_multiplier;
  std::int32_t result_shift;
  std::int32_t result_offset_after_shift;
};

// OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint is the old deprecated
// name of OutputStageQuantizeDownInt32ByFixedPoint, before we noticed that
// there really wasn't anything Uint8-specific about it.
using OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint = OutputStageQuantizeDownInt32ByFixedPoint;

// Variant of OutputStageQuantizeDownInt32ByFixedPoint where the 'shift'
// is not necessarily just a right shift, so we can represent multipliers
// greater than 1. This takes an result_exponent parameter; when it's
// <= 0, this is equivalent to OutputStageQuantizeDownInt32ByFixedPoint
// with result_shift = -result_exponent.
// In the general case, this consists in first left-shifting by
// std::max(result_exponent, 0), before doing the same as
// OutputStageQuantizeDownInt32ByFixedPoint with
// result_shift = std::max(-result_exponent, 0).
struct OutputStageScaleInt32ByFixedPointAndExponent {
  std::int32_t result_fixedpoint_multiplier;
  std::int32_t result_exponent;
  std::int32_t result_offset_after_shift;
};

// This output stage takes int32 values that are expected to be already
// on the final uint8 scale, but not necessarily in the [0..255] range.
// It clamps them to the [0..255] range and returns them casted to uint8.
struct OutputStageSaturatingCastToUint8 {};

// This output stage takes int32 values that are expected to be already
// on the final int16 scale, but not necessarily in the [-32768..32767] range.
// It clamps them to the [-32768..32767] range and returns them casted to int16.
struct OutputStageSaturatingCastToInt16 {};

// This output stage depends on a "bias vector" that should contain int32
// entries, and be either a row-vector of the same number of columns as the
// result matrix, or a column-vector of the same number of rows as the
// result matrix. This output stage takes int32 values and adds to them
// the corresponding entry of the bias vector (broadcasted in the other
// direction to fit the matrix's shape), outputting int32 values.
template <typename VectorType>
struct OutputStageBiasAddition {
  VectorType bias_vector;
};

// This output stage clamps value between the specified min and max bounds.
// It can be used to implement "rectified linear unit" activation functions
// in neural networks.
struct OutputStageClamp {
  std::int32_t min;
  std::int32_t max;
};

struct OutputStageTanh {
  std::int32_t real_zero_as_int32;
  std::int32_t real_amplitude_as_int32;
};

// An output pipeline is just a std::tuple of output stages.
// This function generates a standard output pipeline consisting of two stages:
// OutputStageQuantizeDownInt32ToUint8Scale, OutputStageSaturatingCastToUint8.
inline std::tuple<OutputStageQuantizeDownInt32ToUint8Scale,
                  OutputStageSaturatingCastToUint8>
MakeStandardOutputPipeline(std::int32_t result_offset,
                           std::int32_t result_mult_int,
                           std::int32_t result_shift) {
  OutputStageQuantizeDownInt32ToUint8Scale quantize_down_stage;
  quantize_down_stage.result_offset = result_offset;
  quantize_down_stage.result_mult_int = result_mult_int;
  quantize_down_stage.result_shift = result_shift;
  OutputStageSaturatingCastToUint8 saturating_cast_stage;
  return std::make_tuple(quantize_down_stage, saturating_cast_stage);
}

// An output pipeline is just a std::tuple of output stages.
// This function generates a standard output pipeline consisting of two stages:
// OutputStageQuantizeDownInt32ToUint8ScalePC, OutputStageSaturatingCastToUint8.
template <VectorShape tShape>
inline std::tuple<OutputStageQuantizeDownInt32ToUint8ScalePC<tShape>,
                  OutputStageSaturatingCastToUint8>
MakeStandardOutputPipeline(
    const VectorMap<const std::int32_t, tShape>& result_offset,
    const VectorMap<const std::int32_t, tShape>& result_mult_int,
    std::int32_t result_shift) {
  OutputStageQuantizeDownInt32ToUint8ScalePC<tShape> quantize_down_stage;
  quantize_down_stage.result_offset = result_offset;
  quantize_down_stage.result_mult_int = result_mult_int;
  quantize_down_stage.result_shift = result_shift;
  OutputStageSaturatingCastToUint8 saturating_cast_stage;
  return std::make_tuple(quantize_down_stage, saturating_cast_stage);
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_