/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
#define TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_

#define EIGEN_USE_THREADS

// This is a set of functions that standardize how quantized values are
// interpreted as float numbers.
// All of the current implementations are for reference and have not been
// optimized. They should be implementable using fixed-point representations
// to avoid a dependency on floating-point hardware.

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define QUANTIZATION_UTILS_USE_NEON
#include <arm_neon.h>
#endif

#include <array>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#define GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
#include "public/gemmlowp.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/threadpool.h"

namespace tensorflow {

// We have to be able to detect and handle overflows in int32, so this function
// uses doubles and int64s to make sure we have enough room.
template <class T>
int64 FloatToQuantizedUnclamped(float input, float range_min, float range_max) {
  const int64 lowest_quantized =
      static_cast<int64>(Eigen::NumTraits<T>::lowest());
  if (range_min == range_max) {
    return lowest_quantized;
  }
  const int number_of_bits = sizeof(T) * 8;
  const int64 number_of_steps = static_cast<int64>(1) << number_of_bits;
  const double range_adjust = (number_of_steps / (number_of_steps - 1.0));
  const double range = ((range_max - range_min) * range_adjust);
  const double range_scale = (number_of_steps / range);
  int64 quantized =
      (round(input * range_scale) - round(range_min * range_scale));
  quantized += lowest_quantized;
  return quantized;
}

// This converts the float into the final quantized type, clamping/saturating
// any overflows or underflows.
template <class T>
T FloatToQuantized(float input, float range_min, float range_max) {
  if (std::is_same<T, float>::value) {
    // Specialization for float. This is used in the reference implementation
    // for float, which is useful for comparing performance between the float
    // and quantized paths.
    return input;
  }
  int64 quantized = FloatToQuantizedUnclamped<T>(input, range_min, range_max);
  const int64 lowest_quantized =
      static_cast<int64>(Eigen::NumTraits<T>::lowest());
  const int64 highest_quantized =
      static_cast<int64>(Eigen::NumTraits<T>::highest());
  quantized = std::max(quantized, lowest_quantized);
  quantized = std::min(quantized, highest_quantized);
  return static_cast<T>(static_cast<int32>(quantized));
}
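// Worked example for FloatToQuantized (illustrative, assuming round() rounds
// halves away from zero): with T = quint8 and the range [0.0f, 1.0f],
// number_of_steps is 256, range_adjust is 256/255, and range_scale works out
// to 255, so:
//   FloatToQuantized<quint8>(0.0f, 0.0f, 1.0f) == 0
//   FloatToQuantized<quint8>(0.5f, 0.0f, 1.0f) == 128  // round(127.5)
//   FloatToQuantized<quint8>(1.0f, 0.0f, 1.0f) == 255
//   FloatToQuantized<quint8>(2.0f, 0.0f, 1.0f) == 255  // clamped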

template <class T>
float QuantizedToFloat(T input, float range_min, float range_max) {
  if (std::is_same<T, float>::value) {
    // Specialization for float. This is used in the reference implementation
    // for float, which is useful for comparing performance between the float
    // and quantized paths.
    return input;
  }
  if (range_min == range_max) {
    return range_min;
  }
  const int number_of_bits = sizeof(T) * 8;
  const int64 number_of_steps = static_cast<int64>(1) << number_of_bits;
  const double range_adjust = (number_of_steps / (number_of_steps - 1.0));
  const double range = ((range_max - range_min) * range_adjust);
  const double range_scale = (range / number_of_steps);
  const int64 lowest_quantized =
      static_cast<int64>(Eigen::NumTraits<T>::lowest());
  const double offset_input = static_cast<double>(input) - lowest_quantized;
  // For compatibility with DEQUANTIZE_WITH_EIGEN, we should convert
  // range_scale to a float, otherwise range_min_rounded might be slightly
  // different.
  const double range_min_rounded =
      round(range_min / static_cast<float>(range_scale)) *
      static_cast<float>(range_scale);
  const double result = range_min_rounded + (offset_input * range_scale);
  return static_cast<float>(result);
}
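// Worked example for QuantizedToFloat (illustrative, up to float rounding):
// with T = quint8 and the range [0.0f, 1.0f], range_scale is (256/255)/256,
// i.e. 1/255, and range_min_rounded is 0, so the result is approximately
// input / 255.0f: 0 -> 0.0f, 128 -> ~0.502f, 255 -> 1.0f. Round trips through
// FloatToQuantized and back land on the nearest representable code point, not
// necessarily the original float.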

template <class T>
float FloatForOneQuantizedLevel(float range_min, float range_max) {
  const int64 highest = static_cast<int64>(Eigen::NumTraits<T>::highest());
  const int64 lowest = static_cast<int64>(Eigen::NumTraits<T>::lowest());
  const float float_for_one_quantized_level =
      (range_max - range_min) / (highest - lowest);
  return float_for_one_quantized_level;
}

template <class T1, class T2, class T3>
void QuantizationRangeForMultiplication(float min_a, float max_a, float min_b,
                                        float max_b, float* min_c,
                                        float* max_c) {
  const float a_float_for_one_quant_level =
      FloatForOneQuantizedLevel<T1>(min_a, max_a);
  const float b_float_for_one_quant_level =
      FloatForOneQuantizedLevel<T2>(min_b, max_b);

  const int64 c_highest = static_cast<int64>(Eigen::NumTraits<T3>::highest());
  const int64 c_lowest = static_cast<int64>(Eigen::NumTraits<T3>::lowest());
  const float c_float_for_one_quant_level =
      a_float_for_one_quant_level * b_float_for_one_quant_level;

  *min_c = c_float_for_one_quant_level * c_lowest;
  *max_c = c_float_for_one_quant_level * c_highest;
}
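// Worked example for QuantizationRangeForMultiplication (illustrative, up to
// float rounding): with T1 = T2 = quint8, each over [0.0f, 255.0f],
// FloatForOneQuantizedLevel is 1.0f for both inputs, so one step of the
// product is 1.0f, and for T3 = qint32 the output range becomes roughly
// [-2147483648.0f, 2147483647.0f].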

// input_array is an eigen Tensor.  q2f is a QuantizedToFloatStruct.
// This evaluates to an eigen tensor expression, to be used like:
// auto tensor = DEQUANTIZE_WITH_EIGEN(input_tensor, q2f);
#define DEQUANTIZE_WITH_EIGEN(input_array, q2f)                         \
  ((q2f.range_min_rounded - q2f.lowest_quantized() * q2f.range_scale) + \
   input_array.template cast<float>() * q2f.range_scale)

// input_array is an eigen Tensor.  f2q is a FloatToQuantizedStruct.
// OutputType is the type of output (e.g. quint8).
// This evaluates to an eigen tensor expression, to be used like:
// auto tensor = QUANTIZE_WITH_EIGEN(input_tensor, f2q, T);
#define QUANTIZE_WITH_EIGEN(input_array, f2q, OutputType) \
  ((input_array * f2q.range_scale).round() -              \
   (f2q.range_min_scaled - f2q.lowest_quantized()))       \
      .cwiseMax(f2q.lower_bound_float())                  \
      .cwiseMin(f2q.upper_bound_float())                  \
      .template cast<int32>()                             \
      .template cast<OutputType>()

// For use with DEQUANTIZE_WITH_EIGEN.
template <typename T>
struct QuantizedToFloatStruct {
  static constexpr int number_of_bits = sizeof(T) * 8;
  static constexpr int64 number_of_steps = static_cast<int64>(1)
                                           << number_of_bits;

  static float lowest_quantized() {
    return static_cast<float>(Eigen::NumTraits<T>::lowest());
  }

  QuantizedToFloatStruct(float range_min, float range_max)
      : range_min(range_min),
        range_scale((range_max - range_min) / (number_of_steps - 1.0)),
        range_min_rounded(range_max == range_min
                              ? range_min
                              : round(range_min / range_scale) * range_scale) {}

  const float range_min;
  const float range_scale;
  const float range_min_rounded;
};

// For use with QUANTIZE_WITH_EIGEN.
template <typename T>
struct FloatToQuantizedStruct {
  static constexpr int number_of_bits = sizeof(T) * 8;
  static constexpr int64 number_of_steps = static_cast<int64>(1)
                                           << number_of_bits;
  static constexpr double range_adjust =
      (number_of_steps / (number_of_steps - 1.0));

  // Casting QInt32's lowest or highest to a float gives a float that can't be
  // cast back to int32 or QInt32.  Instead, use bounds that can be converted
  // back to int32 without going outside the range of an int32.
  static float lower_bound_float() {
    return Eigen::numext::maxi(
        static_cast<float>(Eigen::NumTraits<T>::lowest()), -2.147483648e+09f);
  }
  static float upper_bound_float() {
    return Eigen::numext::mini(
        static_cast<float>(Eigen::NumTraits<T>::highest()), +2.147483520e+09f);
  }

  static float lowest_quantized() {
    return static_cast<float>(Eigen::NumTraits<T>::lowest());
  }

  FloatToQuantizedStruct(float range_min, float range_max)
      : range_min(range_min),
        range_scale(range_max == range_min
                        ? 0.0
                        : (number_of_steps - 1.0) / (range_max - range_min)),
        range_min_scaled(round(range_min * range_scale)) {}

  const float range_min;
  const float range_scale;
  const float range_min_scaled;
};
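// A minimal sketch of how the two structs pair with the macros above (the
// tensor and range names here are hypothetical; see
// RequantizeManyInNewRangeUsingEigen() below for the in-tree usage):
//
//   QuantizedToFloatStruct<quint8> q2f(in_min, in_max);
//   auto as_float = DEQUANTIZE_WITH_EIGEN(some_quint8_flat, q2f);
//   FloatToQuantizedStruct<qint32> f2q(out_min, out_max);
//   auto requantized = QUANTIZE_WITH_EIGEN(as_float, f2q, qint32);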

template <class T1, class T2>
inline T2 RequantizeInNewRange(T1 input, float min_input, float max_input,
                               float min_new, float max_new) {
  const float input_float = QuantizedToFloat<T1>(input, min_input, max_input);
  return FloatToQuantized<T2>(input_float, min_new, max_new);
}

template <class T1, class T2>
inline void RequantizeManyInNewRange(const T1* input, int64 count,
                                     float min_input, float max_input,
                                     float min_output, float max_output,
                                     T2* output) {
  for (int64 index = 0; index < count; ++index) {
    const float input_float =
        QuantizedToFloat<T1>(input[index], min_input, max_input);
    output[index] = FloatToQuantized<T2>(input_float, min_output, max_output);
  }
}

// Because converting 32-bit accumulated results down to eight bit is a common
// case, we have a specialized code path to handle it as efficiently as
// possible using only fixed-point math for the inner loop.
inline void RequantizeManyInNewRangeReference(const qint32* input, int64 count,
                                              float min_input, float max_input,
                                              float min_output,
                                              float max_output,
                                              quint8* output) {
  // Initially we calculate all the constants we need once, before we go into
  // the inner loop.  If this is updated, also update the Eigen version.
  const int fp_shift = 16;
  const float input_range = max_input - min_input;
  const float output_range = max_output - min_output;
  const float recip_output_range =
      output_range == 0.0 ? 0.0 : (255.0 / output_range);
  const float input_rezero = (min_input + max_input) / 2.0;
  const int64 range_scale_fp =
      output_range == 0.0 ? 0
                          : static_cast<int64>(255.0 * (1 << fp_shift) *
                                               input_range / output_range);
  const int64 input_offset_fp =
      static_cast<int64>(input_rezero * recip_output_range * (1 << fp_shift));
  const int64 output_offset_fp =
      output_range == 0.0
          ? 0
          : static_cast<int64>((1 << fp_shift) * (min_output * 255.0) /
                               output_range);
  const int64 rounding_delta = 1 << (fp_shift - 1);

  // Inside this loop we just do minimal adds, multiplies, and shifts, in a way
  // that could be easily adapted for a SIMD implementation. It should also be
  // possible to perform all the calculations in 32-bit rather than 64, but
  // that's not been implemented yet.
  for (int64 index = 0; index < count; ++index) {
    const int64 input_value = static_cast<int64>(input[index]);
    const int64 fp_value =
        ((input_value * range_scale_fp) >> 32) + input_offset_fp;
    const int64 offset_intermediate = fp_value - output_offset_fp;
    const int64 round_intermediate = offset_intermediate + rounding_delta;
    int64 quantized_int64 = round_intermediate >> fp_shift;
    quantized_int64 = std::max(quantized_int64, 0LL);
    quantized_int64 = std::min(quantized_int64, 255LL);
    output[index] = static_cast<quint8>(static_cast<int32>(quantized_int64));
  }
}
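// A sketch of the math in RequantizeManyInNewRangeReference (a restatement,
// not part of the original comments): for a qint32 code x, QuantizedToFloat
// gives roughly input_float ~= input_rezero + x * input_range / 2^32, and the
// loop computes round((input_float - min_output) * 255 / output_range) with
// 16 fractional fixed-point bits, where (x * range_scale_fp) >> 32 supplies
// the x * input_range / 2^32 term already scaled by
// 255 * 2^16 / output_range.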

// Another common case is converting eight-bit inputs up to thirty-two bits, so
// we have specialized fixed-point code to accelerate that. There is also a NEON
// version for ARM devices below.
inline void RequantizeManyInNewRange8To32BitReference(
    const quint8* input, int64 count, float min_input, float max_input,
    float min_output, float max_output, qint32* output) {
  const float code_0_float = QuantizedToFloat<quint8>(0, min_input, max_input);
  const float code_1_float = QuantizedToFloat<quint8>(1, min_input, max_input);
  const int64 code_0_int64 =
      FloatToQuantizedUnclamped<qint32>(code_0_float, min_output, max_output);
  const int64 code_1_int64 =
      FloatToQuantizedUnclamped<qint32>(code_1_float, min_output, max_output);
  const int32 mult_int32 = static_cast<int32>(code_1_int64 - code_0_int64);
  const int64 lowest_quantized =
      static_cast<int64>(Eigen::NumTraits<qint32>::lowest());
  const int64 highest_quantized =
      static_cast<int64>(Eigen::NumTraits<qint32>::highest());
  for (int64 i = 0; i < count; ++i) {
    const int64 input_value = static_cast<int64>(input[i]);
    int64 output_value = code_0_int64 + (input_value * mult_int32);
    output_value = std::max(output_value, lowest_quantized);
    output_value = std::min(output_value, highest_quantized);
    output[i] = static_cast<int32>(output_value);
  }
}
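// Put differently (an informal restatement): RequantizeManyInNewRange8To32Bit-
// Reference applies the affine map
//   output = code_0_int64 + input * (code_1_int64 - code_0_int64)
// which is pinned by where the quint8 codes 0 and 1 land in the qint32 output
// space, then clamps to the qint32 limits.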

#ifdef QUANTIZATION_UTILS_USE_NEON
// Speeds up the 32->8bit conversion using fixed-point arithmetic and NEON SIMD
// intrinsics for ARM platforms.
inline void RequantizeManyInNewRangeNeon(const qint32* input, int64 count,
                                         float min_input, float max_input,
                                         float min_output, float max_output,
                                         quint8* output) {
  // Initially we calculate all the constants we need once, before we go into
  // the inner loop.  If this is updated, also update the Eigen version.
  const int fp_shift = 16;

  // Calculate range variables in advance.
  // Input range.
  const float input_range = max_input - min_input;
  // Output range.
  const float output_range = max_output - min_output;
  // Ratio of output range.
  const float recip_output_range =
      output_range == 0.0 ? 0.0 : (255.0 / output_range);
  // Average of input range as zero position of input.
  const float input_rezero = (min_input + max_input) / 2.0;
  // In-out range scale.
  const int32 range_scale_fp =
      output_range == 0.0 ? 0
                          : static_cast<int32>(255.0 * (1 << (fp_shift - 16)) *
                                               input_range / output_range);
  // Input zero position offset to output.
  const int32 input_offset_fp =
      static_cast<int32>(input_rezero * recip_output_range * (1 << fp_shift));
  // Output min offset.
  const int32 output_offset_fp =
      output_range == 0.0
          ? 0
          : static_cast<int32>((1 << fp_shift) * (min_output * 255.0) /
                               output_range);
  const int32 rounding_delta = 1 << (fp_shift - 1);

  // Broadcast the constants to each lane.
  const int32x4_t range_scale_fp_32x4 = vmovq_n_s32(range_scale_fp);
  const int32x4_t input_offset_fp_32x4 = vmovq_n_s32(input_offset_fp);
  const int32x4_t output_offset_fp_32x4 = vmovq_n_s32(output_offset_fp);
  const int32x4_t rounding_delta_32x4 = vmovq_n_s32(rounding_delta);

  int64 index = 0;
  // Use SIMD to requantize.
  for (; index < (count - 7); index += 8) {
    const int32* input_ptr = &(input->value) + index;
    const int32x4_t input_value_low_32x4 = vld1q_s32(input_ptr);
    const int32x4_t input_value_high_32x4 = vld1q_s32(input_ptr + 4);
    const int32x4_t fp_value_low_32x4 = vaddq_s32(
        input_offset_fp_32x4,
        vmulq_s32(vshrq_n_s32(input_value_low_32x4, 16), range_scale_fp_32x4));
    const int32x4_t fp_value_high_32x4 = vaddq_s32(
        input_offset_fp_32x4,
        vmulq_s32(vshrq_n_s32(input_value_high_32x4, 16), range_scale_fp_32x4));
    const int32x4_t offset_intermediate_low_32x4 =
        vsubq_s32(fp_value_low_32x4, output_offset_fp_32x4);
    const int32x4_t offset_intermediate_high_32x4 =
        vsubq_s32(fp_value_high_32x4, output_offset_fp_32x4);
    const int32x4_t round_intermediate_low_32x4 =
        vaddq_s32(offset_intermediate_low_32x4, rounding_delta_32x4);
    const int32x4_t round_intermediate_high_32x4 =
        vaddq_s32(offset_intermediate_high_32x4, rounding_delta_32x4);
    const int16x4_t quantized_low_16x4 =
        vqmovn_s32(vshrq_n_s32(round_intermediate_low_32x4, fp_shift));
    const int16x4_t quantized_high_16x4 =
        vqmovn_s32(vshrq_n_s32(round_intermediate_high_32x4, fp_shift));
    const uint8x8_t quantized_8x8 =
        vqmovun_s16(vcombine_s16(quantized_low_16x4, quantized_high_16x4));
    uint8* output_ptr = &(output->value) + index;
    vst1_u8(output_ptr, quantized_8x8);
  }

  // Requantize remaining elements in array without SIMD.
  for (; index < count; ++index) {
    const int32 input_value = static_cast<int32>(input[index]);
    const int32 fp_value =
        static_cast<int32>(
            (static_cast<int32>(input_value >> 16) * (range_scale_fp))) +
        input_offset_fp;
    const int32 offset_intermediate = fp_value - output_offset_fp;
    const int32 round_intermediate = offset_intermediate + rounding_delta;
    int32 quantized_int32 = round_intermediate >> fp_shift;
    quantized_int32 = std::max(quantized_int32, 0);
    quantized_int32 = std::min(quantized_int32, 255);
    output[index] = static_cast<quint8>(static_cast<int32>(quantized_int32));
  }
}

template <>
inline void RequantizeManyInNewRange<qint32, quint8>(
    const qint32* input, int64 count, float min_input, float max_input,
    float min_output, float max_output, quint8* output) {
  const float input_range = max_input - min_input;
  const float output_range = max_output - min_output;
  if ((input_range / output_range) > 16384.0f) {
    // Our NEON implementation uses 32-bit math and can't handle very
    // large ranges, so fall back to the reference implementation. We don't
    // expect these to be common in models, so this shouldn't be a performance
    // problem in practice.
    RequantizeManyInNewRangeReference(input, count, min_input, max_input,
                                      min_output, max_output, output);
  } else {
    RequantizeManyInNewRangeNeon(input, count, min_input, max_input, min_output,
                                 max_output, output);
  }
}

// NEON accelerated 16bit rounded division by 2^n.
template <int POW>
inline int16x8_t Divide16x8PowRound(const int16x8_t val) {
  const int16x8_t val_sign = vshrq_n_s16(val, 15);
  const int16x8_t val_xor = veorq_s16(val, val_sign);
  const int16x8_t val_pos = vsubq_s16(val_xor, val_sign);
  const int16x8_t shifted_val_pos = vrshrq_n_s16(val_pos, POW);
  const int16x8_t shifted_val_pos_xor = veorq_s16(shifted_val_pos, val_sign);
  const int16x8_t shifted_val = vsubq_s16(shifted_val_pos_xor, val_sign);
  return shifted_val;
}

// NEON accelerated 64bit rounded division by 2^n.
template <int POW>
inline int64x2_t Divide64x2PowRound(const int64x2_t val) {
  const int64x2_t val_sign = vshrq_n_s64(val, 63);
  const int64x2_t val_xor = veorq_s64(val, val_sign);
  const int64x2_t val_pos = vsubq_s64(val_xor, val_sign);
  const int64x2_t shifted_val_pos = vrshrq_n_s64(val_pos, POW);
  const int64x2_t shifted_val_pos_xor = veorq_s64(shifted_val_pos, val_sign);
  const int64x2_t shifted_val = vsubq_s64(shifted_val_pos_xor, val_sign);
  return shifted_val;
}

// NEON accelerated 16bit division by 2^n.
// CAVEAT: The input must be greater than min-int16 to avoid underflow.
template <int POW>
inline int16x8_t Divide16x8Pow(const int16x8_t val) {
  static constexpr int16 FIRST_BIT_VAL = 0x0000000000000001;
  static const int16x8_t FIRST_BIT = vmovq_n_s16(FIRST_BIT_VAL);
  const int16x8_t val_sign = vshrq_n_s16(val, 15);
  const int16x8_t neg_offset = vandq_s16(val_sign, FIRST_BIT);
  const int16x8_t val_with_offset = vsubq_s16(val, neg_offset);
  const int16x8_t shifted_wo_offset =
      vsraq_n_s16(neg_offset, val_with_offset, POW);
  return shifted_wo_offset;
}

// NEON accelerated 64bit division by 2^n.
// CAVEAT: The input must be greater than min-int64 to avoid underflow.
template <int POW>
inline int64x2_t Divide64x2Pow(const int64x2_t val) {
  static constexpr int64 FIRST_BIT_VAL = 0x0000000000000001;
  static const int64x2_t FIRST_BIT = vmovq_n_s64(FIRST_BIT_VAL);
  const int64x2_t val_sign = vshrq_n_s64(val, 63);
  const int64x2_t neg_offset = vandq_s64(val_sign, FIRST_BIT);
  const int64x2_t val_with_offset = vsubq_s64(val, neg_offset);
  const int64x2_t shifted_wo_offset =
      vsraq_n_s64(neg_offset, val_with_offset, POW);
  return shifted_wo_offset;
}
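// Illustrative behaviour of the division helpers above (hypothetical lane
// values, POW = 2): the *PowRound variants round to nearest, so 7 -> 2 and
// -7 -> -2, while the *Pow variants truncate toward zero, so 7 -> 1 and
// -7 -> -1. The 16x8 and 64x2 versions behave the same way per lane; only the
// lane width differs.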

// 32bit x 2 NEON accelerated lerp computation.
template <int RESOLUTION>
inline int32x2_t ComputeLerp32x2(const int32x2_t top_left,
                                 const int32x2_t top_right,
                                 const int32x2_t bottom_left,
                                 const int32x2_t bottom_right,
                                 const int32x2_t x_lerp,
                                 const int32x2_t y_lerp) {
  static_assert(RESOLUTION < 31, "RESOLUTION must be less than 31");
  constexpr int32 RESOLUTION_MULT32 = (1 << RESOLUTION);
  static const int32x2_t RESOLUTION_MULT32x2 = vmov_n_s32(RESOLUTION_MULT32);

  const int64x2_t top_left_x_res = vmull_s32(top_left, RESOLUTION_MULT32x2);
  const int64x2_t bottom_left_x_res =
      vmull_s32(bottom_left, RESOLUTION_MULT32x2);

  const int32x2_t top_right_sub_top_left = vsub_s32(top_right, top_left);
  const int64x2_t top_x_res =
      vmlal_s32(top_left_x_res, top_right_sub_top_left, x_lerp);
  const int32x2_t bottom_right_sub_bottom_left =
      vsub_s32(bottom_right, bottom_left);
  const int64x2_t bottom_x_res =
      vmlal_s32(bottom_left_x_res, bottom_right_sub_bottom_left, x_lerp);

  const int64x2_t bottom_sub_top_x_res = vsubq_s64(bottom_x_res, top_x_res);
  const int64x2_t bottom_sub_top =
      Divide64x2Pow<RESOLUTION>(bottom_sub_top_x_res);
  const int32x2_t bottom_sub_top_32 = vqmovn_s64(bottom_sub_top);
  const int64x2_t top_add_bottom_sub_top_mul_ylerp_x_res =
      vmlal_s32(top_x_res, bottom_sub_top_32, y_lerp);
  const int64x2_t retval =
      Divide64x2PowRound<RESOLUTION>(top_add_bottom_sub_top_mul_ylerp_x_res);
  const int32x2_t retval32 = vqmovn_s64(retval);
  return retval32;
}
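// Both ComputeLerp32x2 above and ComputeLerp8x8 below evaluate, per lane and
// in fixed point with RESOLUTION fractional bits (a restatement of the code,
// not an original comment):
//   top    = top_left + (top_right - top_left) * x_lerp / 2^RESOLUTION
//   bottom = bottom_left
//            + (bottom_right - bottom_left) * x_lerp / 2^RESOLUTION
//   result = top + (bottom - top) * y_lerp / 2^RESOLUTION
// which is standard bilinear interpolation of the four corner values.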

// 8bit x 8 NEON accelerated lerp computation.
template <int RESOLUTION>
inline uint8x8_t ComputeLerp8x8(const uint8x8_t top_left8x8,
                                const uint8x8_t top_right8x8,
                                const uint8x8_t bottom_left8x8,
                                const uint8x8_t bottom_right8x8,
                                const int16x8_t x_lerp,
                                const int16x8_t y_lerp) {
  static_assert(RESOLUTION < 8, "RESOLUTION must be less than 8");
  constexpr uint8 RESOLUTION_MULT_VAL = (1 << RESOLUTION);
  static const uint8x8_t RESOLUTION_MULT = vdup_n_u8(RESOLUTION_MULT_VAL);

  const int16x8_t top_left_x_res =
      vreinterpretq_s16_u16(vmull_u8(top_left8x8, RESOLUTION_MULT));
  const int16x8_t bottom_left_x_res =
      vreinterpretq_s16_u16(vmull_u8(bottom_left8x8, RESOLUTION_MULT));

  const int16x8_t top_right_sub_top_left =
      vreinterpretq_s16_u16(vsubl_u8(top_right8x8, top_left8x8));
  const int16x8_t top_x_res =
      vmlaq_s16(top_left_x_res, top_right_sub_top_left, x_lerp);

  const int16x8_t bottom_right_sub_bottom_left =
      vreinterpretq_s16_u16(vsubl_u8(bottom_right8x8, bottom_left8x8));
  const int16x8_t bottom_x_res =
      vmlaq_s16(bottom_left_x_res, bottom_right_sub_bottom_left, x_lerp);

  const int16x8_t bottom_sub_top_x_res = vsubq_s16(bottom_x_res, top_x_res);
  const int16x8_t bottom_sub_top =
      Divide16x8Pow<RESOLUTION>(bottom_sub_top_x_res);
  const int16x8_t top_add_bottom_sub_top_mul_ylerp_x_res =
      vmlaq_s16(top_x_res, bottom_sub_top, y_lerp);
  const int16x8_t retval16 =
      Divide16x8PowRound<RESOLUTION>(top_add_bottom_sub_top_mul_ylerp_x_res);
  const uint8x8_t retval = vmovn_u16(vreinterpretq_u16_s16(retval16));
  return retval;
}

// Requantizes 8 quint8 values to 8 qint32 values in parallel using NEON.
// Returns a std::array instead of writing through a pointer so the caller can
// rely on return value optimization.
inline std::array<int32x4_t, 2> Requantize8x8To32Neon(
    const uint8* input_ptr, const int64x2_t input_0_64x2,
    const int32x2_t input_mult_32x2) {
  const uint8x8_t input_value_8x8 = vld1_u8(input_ptr);
  const int16x8_t input_value_16x8 =
      vreinterpretq_s16_u16(vmovl_u8(input_value_8x8));
  const int16x4_t input_value_low_16x4 = vget_low_s16(input_value_16x8);
  const int16x4_t input_value_high_16x4 = vget_high_s16(input_value_16x8);
  const int32x4_t input_value_low_32x4 = vmovl_s16(input_value_low_16x4);
  const int32x4_t input_value_high_32x4 = vmovl_s16(input_value_high_16x4);
  const int32x2_t input_value_low_low_32x2 = vget_low_s32(input_value_low_32x4);
  const int32x2_t input_value_low_high_32x2 =
      vget_high_s32(input_value_low_32x4);
  const int32x2_t input_value_high_low_32x2 =
      vget_low_s32(input_value_high_32x4);
  const int32x2_t input_value_high_high_32x2 =
      vget_high_s32(input_value_high_32x4);
  const int64x2_t mult_result_low_low_64x2 =
      vmlal_s32(input_0_64x2, input_value_low_low_32x2, input_mult_32x2);
  const int64x2_t mult_result_low_high_64x2 =
      vmlal_s32(input_0_64x2, input_value_low_high_32x2, input_mult_32x2);
  const int64x2_t mult_result_high_low_64x2 =
      vmlal_s32(input_0_64x2, input_value_high_low_32x2, input_mult_32x2);
  const int64x2_t mult_result_high_high_64x2 =
      vmlal_s32(input_0_64x2, input_value_high_high_32x2, input_mult_32x2);
  const int32x2_t output_value_low_low_32x2 =
      vqmovn_s64(mult_result_low_low_64x2);
  const int32x2_t output_value_low_high_32x2 =
      vqmovn_s64(mult_result_low_high_64x2);
  const int32x2_t output_value_high_low_32x2 =
      vqmovn_s64(mult_result_high_low_64x2);
  const int32x2_t output_value_high_high_32x2 =
      vqmovn_s64(mult_result_high_high_64x2);
  const int32x4_t output_value_low_32x4 =
      vcombine_s32(output_value_low_low_32x2, output_value_low_high_32x2);
  const int32x4_t output_value_high_32x4 =
      vcombine_s32(output_value_high_low_32x2, output_value_high_high_32x2);
  return std::array<int32x4_t, 2>{
      {output_value_low_32x4, output_value_high_32x4}};
}

// Speeds up the 8->32bit conversion using fixed-point arithmetic and NEON SIMD
// intrinsics for ARM platforms.
template <>
inline void RequantizeManyInNewRange<quint8, qint32>(
    const quint8* input, int64 count, float min_input, float max_input,
    float min_output, float max_output, qint32* output) {
  // Pre-calculate the zero position and multiplier.
  // Calculate the 0 and 1 values in float.
  const float code_0_float = QuantizedToFloat<quint8>(0, min_input, max_input);
  const float code_1_float = QuantizedToFloat<quint8>(1, min_input, max_input);

  // Quantize the 0 and 1 values into the int64 output space.
  const int64 code_0_int64 =
      FloatToQuantizedUnclamped<qint32>(code_0_float, min_output, max_output);
  const int64 code_1_int64 =
      FloatToQuantizedUnclamped<qint32>(code_1_float, min_output, max_output);

  // Calculate the multiplier.
  const int32 mult_int32 = static_cast<int32>(code_1_int64 - code_0_int64);

  // Broadcast the 0 position and multiplier to all lanes.
  const int64x2_t code_0_64x2 = vmovq_n_s64(code_0_int64);
  const int32x2_t mult_32x2 = vmov_n_s32(mult_int32);

  int64 i = 0;

  // Use SIMD to requantize the array.
  for (; i < (count - 7); i += 8) {
    const uint8* input_ptr = &(input->value) + i;
    int32* output_ptr = &(output->value) + i;
    const std::array<int32x4_t, 2> output_value =
        Requantize8x8To32Neon(input_ptr, code_0_64x2, mult_32x2);
    vst1q_s32(output_ptr + 0, output_value[0]);
    vst1q_s32(output_ptr + 4, output_value[1]);
  }

  // Requantize the remaining elements without SIMD.
  const int64 lowest_quantized =
      static_cast<int64>(Eigen::NumTraits<qint32>::lowest());
  const int64 highest_quantized =
      static_cast<int64>(Eigen::NumTraits<qint32>::highest());

  for (; i < count; ++i) {
    const int64 input_value = static_cast<int64>(input[i]);
    int64 output_value = code_0_int64 + (input_value * mult_int32);
    output_value = std::max(output_value, lowest_quantized);
    output_value = std::min(output_value, highest_quantized);
    output[i] = static_cast<int32>(output_value);
  }
}

#else

// If SIMD implementations aren't available, then use these default reference
// versions.
template <>
inline void RequantizeManyInNewRange<qint32, quint8>(
    const qint32* input, int64 count, float min_input, float max_input,
    float min_output, float max_output, quint8* output) {
  RequantizeManyInNewRangeReference(input, count, min_input, max_input,
                                    min_output, max_output, output);
}

template <>
inline void RequantizeManyInNewRange<quint8, qint32>(
    const quint8* input, int64 count, float min_input, float max_input,
    float min_output, float max_output, qint32* output) {
  RequantizeManyInNewRange8To32BitReference(input, count, min_input, max_input,
                                            min_output, max_output, output);
}

#endif

template <int shift>
struct int64_right_shift_op {
  EIGEN_EMPTY_STRUCT_CTOR(int64_right_shift_op)
  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE const int64 operator()(const int64& a) const {
    return a >> shift;
  }
};

// See RequantizeManyInNewRange() for a non-eigen reference implementation.
template <class T1, class T2>
inline void RequantizeManyInNewRangeUsingEigen(
    const Eigen::ThreadPoolDevice& device, const Tensor& input, float min_input,
    float max_input, float min_output, float max_output, Tensor* output) {
  auto input_array = input.flat<T1>();
  QuantizedToFloatStruct<T1> q2f(min_input, max_input);
  auto input_float = DEQUANTIZE_WITH_EIGEN(input_array, q2f);
  FloatToQuantizedStruct<T2> f2q(min_output, max_output);
  auto input_requantized = QUANTIZE_WITH_EIGEN(input_float, f2q, T2);

  output->flat<T2>().device(device) = input_requantized;
}

// See RequantizeManyInNewRange() for a non-eigen reference implementation.
//
// Because converting 32-bit accumulated results down to eight bit is a common
// case, we have a specialized code path to handle it as efficiently as
// possible using only fixed-point math for the inner loop.
template <>
inline void RequantizeManyInNewRangeUsingEigen<qint32, quint8>(
    const Eigen::ThreadPoolDevice& device, const Tensor& input, float min_input,
    float max_input, float min_output, float max_output, Tensor* output) {
  // Initially we calculate all the constants we need once, before we go into
  // the inner loop.  If this is updated, also update the non-Eigen version.
  const int fp_shift = 16;
  const float input_range = max_input - min_input;
  const float output_range = max_output - min_output;
  const float recip_output_range =
      output_range == 0.0 ? 0.0 : (255.0 / output_range);
  const float input_rezero = (min_input + max_input) / 2.0;
  const int64 range_scale_fp =
      output_range == 0.0 ? 0
                          : static_cast<int64>(255.0 * (1 << fp_shift) *
                                               input_range / output_range);
  const int64 input_offset_fp =
      static_cast<int64>(input_rezero * recip_output_range * (1 << fp_shift));
  const int64 output_offset_fp =
      output_range == 0.0
          ? 0
          : static_cast<int64>((1 << fp_shift) * (min_output * 255.0) /
                               output_range);
  const int64 rounding_delta = 1 << (fp_shift - 1);

  // Inside this eigen expression we just do minimal adds, multiplies, and
  // shifts. It should be possible to perform all the calculations in 32-bit
  // rather than 64, but that's not been implemented yet.
  auto input_array = input.flat<qint32>();
  auto fp_value = ((input_array.template cast<int64>() * range_scale_fp)
                       .unaryExpr(int64_right_shift_op<32>())) +
                  (input_offset_fp - output_offset_fp + rounding_delta);
  auto intermediate = fp_value.unaryExpr(int64_right_shift_op<fp_shift>());
  auto input_requantized = intermediate.cwiseMax(0LL)
                               .cwiseMin(255LL)
                               .template cast<int32>()
                               .template cast<quint8>();
  output->flat<quint8>().device(device) = input_requantized;
}

// REQUIRES: 'result->NumElements() == input.NumElements()'
template <class T>
void FloatTensorToQuantizedInPlaceUsingEigen(
    const Eigen::ThreadPoolDevice& device, const Tensor& input, float min,
    float max, Tensor* result) {
  DCHECK_EQ(DataTypeToEnum<T>::v(), result->dtype());
  auto flat_input = input.flat<float>();
  auto flat_result = result->flat<T>();
  DCHECK_EQ(flat_input.size(), flat_result.size());

  FloatToQuantizedStruct<T> f2q(min, max);
  flat_result.device(device) = QUANTIZE_WITH_EIGEN(flat_input, f2q, T);
}

template <class T>
void FloatTensorToQuantizedInPlace(const Tensor& input, float min, float max,
                                   Tensor* result) {
  DCHECK_EQ(DataTypeToEnum<T>::v(), result->dtype());
  auto flat_input = input.flat<float>();
  auto flat_result = result->flat<T>();
  const int data_size = flat_input.size();
  DCHECK(data_size == flat_result.size());
  for (int i = 0; i < data_size; ++i) {
    flat_result(i) = FloatToQuantized<T>(flat_input(i), min, max);
  }
}

template <class T>
Tensor FloatTensorToQuantized(const Tensor& input, float min, float max) {
  Tensor result(DataTypeToEnum<T>::v(), input.shape());
  FloatTensorToQuantizedInPlace<T>(input, min, max, &result);
  return result;
}

// REQUIRES: 'result->NumElements() == input.NumElements()'
template <class T>
void QuantizedTensorToFloatInPlaceUsingEigen(
    const Eigen::ThreadPoolDevice& device, const Tensor& input, float min,
    float max, Tensor* result) {
  DCHECK_EQ(DataTypeToEnum<T>::v(), input.dtype());
  auto flat_input = input.flat<T>();
  auto flat_result = result->flat<float>();
  const int data_size = flat_input.size();
  DCHECK(data_size == flat_result.size());

  QuantizedToFloatStruct<T> q2f(min, max);
  flat_result.device(device) = DEQUANTIZE_WITH_EIGEN(flat_input, q2f);
}

// REQUIRES: 'result->NumElements() == input.NumElements()'
template <class T>
void QuantizedTensorToFloatInPlace(const Tensor& input, float min, float max,
                                   Tensor* result) {
  DCHECK_EQ(DataTypeToEnum<T>::v(), input.dtype());
  auto flat_input = input.flat<T>();
  auto flat_result = result->flat<float>();
  const int data_size = flat_input.size();
  DCHECK(data_size == flat_result.size());
  for (int i = 0; i < data_size; ++i) {
    flat_result(i) = QuantizedToFloat<T>(flat_input(i), min, max);
  }
}

template <class T>
Tensor QuantizedTensorToFloat(const Tensor& input, float min, float max) {
  Tensor result(DT_FLOAT, input.shape());
  QuantizedTensorToFloatInPlace<T>(input, min, max, &result);
  return result;
}
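// A minimal round-trip sketch using the helpers above (the tensor contents and
// ranges are hypothetical):
//
//   Tensor floats(DT_FLOAT, TensorShape({2}));
//   floats.flat<float>()(0) = 0.25f;
//   floats.flat<float>()(1) = 0.75f;
//   Tensor quantized = FloatTensorToQuantized<quint8>(floats, 0.0f, 1.0f);
//   Tensor recovered = QuantizedTensorToFloat<quint8>(quantized, 0.0f, 1.0f);
//
// recovered should be close to, but not necessarily bit-identical to, the
// original float values.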

void GetOutputMinAndMaxForQuantizedAdd(float input_min, float input_max,
                                       float smaller_input_min,
                                       float smaller_input_max,
                                       float* output_min, float* output_max);

// Add <input> and <smaller_input>.  If <smaller_input> has fewer elements than
// <input>, then it is broadcast onto <input>.
template <typename T1, typename T2, typename T3>
void QuantizedAddUsingEigen(const Eigen::ThreadPoolDevice& device,
                            const Tensor& input, float input_min,
                            float input_max, const Tensor& smaller_input,
                            float smaller_input_min, float smaller_input_max,
                            Tensor* output, float* output_min,
                            float* output_max) {
  const auto& input_flat = input.flat<T1>();
  const auto& smaller_input_flat = smaller_input.flat<T2>();
  auto output_flat = output->flat<T3>();

  GetOutputMinAndMaxForQuantizedAdd(input_min, input_max, smaller_input_min,
                                    smaller_input_max, output_min, output_max);
  // To do addition properly, we need to compensate for a possibly unbalanced
  // zero point in the total representation. The quantized value that
  // represents the real number zero needs to be subtracted before addition to
  // make sure that the identity of zero + zero = zero holds.
  const T3 zero_in_total_space =
      FloatToQuantized<T3>(0.0f, *output_min, *output_max);

  const int64 input_element_count = input.NumElements();
  const int64 smaller_input_element_count = smaller_input.NumElements();

  QuantizedToFloatStruct<T1> input_q2f(input_min, input_max);
  QuantizedToFloatStruct<T2> smaller_input_q2f(smaller_input_min,
                                               smaller_input_max);
  FloatToQuantizedStruct<T3> f2q(*output_min, *output_max);

  auto smaller_input_float =
      DEQUANTIZE_WITH_EIGEN(smaller_input_flat, smaller_input_q2f);
  auto smaller_input_in_total_space =
      QUANTIZE_WITH_EIGEN(smaller_input_float, f2q, T3);

  auto input_float = DEQUANTIZE_WITH_EIGEN(input_flat, input_q2f);
  auto input_in_total_space = QUANTIZE_WITH_EIGEN(input_float, f2q, T3);

  Eigen::array<Eigen::DenseIndex, 1> bcast;
  bcast[0] = input_element_count / smaller_input_element_count;
  output_flat.device(device) =
      input_in_total_space +
      (smaller_input_in_total_space.broadcast(bcast) + zero_in_total_space);
}
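// For example (hypothetical shapes): adding a 64-element quint8 bias vector to
// a [32, 64] quint8 input gives bcast[0] == 32, so the smaller input is tiled
// across the outer dimension before the elementwise add, matching the scalar
// loops in QuantizedAdd() below.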

// This is a reference implementation of the bias addition for quantized
// buffers, designed to provide a clear specification for the result we
// want. We'll want to specialize this for particular hardware, and
// probably even fuse it with matrix multiplications in a lot of cases. It's
// important to show the clamping behavior we want in particular.
template <typename T1, typename T2, typename T3>
void QuantizedAdd(const Eigen::ThreadPoolDevice& device, const Tensor& input,
                  float input_min, float input_max, const Tensor& smaller_input,
                  float smaller_input_min, float smaller_input_max,
                  Tensor* output, float* output_min, float* output_max) {
  const auto& input_flat = input.flat<T1>();
  const auto& smaller_input_flat = smaller_input.flat<T2>();
  auto output_flat = output->flat<T3>();

  GetOutputMinAndMaxForQuantizedAdd(input_min, input_max, smaller_input_min,
                                    smaller_input_max, output_min, output_max);
  // To do addition properly, we need to compensate for a possibly unbalanced
  // zero point in the total representation. The quantized value that
  // represents the real number zero needs to be subtracted before addition to
  // make sure that the identity of zero + zero = zero holds.
  const T3 zero_in_total_space =
      FloatToQuantized<T3>(0.0f, *output_min, *output_max);

  const int64 input_element_count = input.NumElements();
  const int64 smaller_input_element_count = smaller_input.NumElements();

  float total_min = *output_min;
  float total_max = *output_max;
  const size_t how_many_iterations =
      (input_element_count / smaller_input_element_count);
  for (size_t iteration = 0; iteration < how_many_iterations; ++iteration) {
    const size_t offset = iteration * smaller_input_element_count;
    for (int c = 0; c < smaller_input_element_count; ++c) {
      const int index = (offset + c);
      // The two numbers we're going to add can each be in very different
      // ranges (e.g. the quantized value '127' may represent very different
      // real numbers in both) so we need to convert them to a common range
      // before we sum them.
      const T1 input_value = input_flat(index);
      const T3 input_in_total_space = RequantizeInNewRange<T1, T3>(
          input_value, input_min, input_max, total_min, total_max);
      const T2 smaller_input_value = smaller_input_flat(c);
      const T3 smaller_input_in_total_space =
          RequantizeInNewRange<T2, T3>(smaller_input_value, smaller_input_min,
                                       smaller_input_max, total_min, total_max);
      const T3 total_pre = input_in_total_space + smaller_input_in_total_space;
      // As noted above, we need to compensate for the offset of the actual
      // zero point in the space we're operating in.
      const T3 total = total_pre + zero_in_total_space;
      output_flat(index) = total;
    }
  }
}

// See gemmlowp/internal/multi_thread_gemm.h for the semantics of Execute.
class TensorflowGemmlowpWorkersPool {
 public:
  TensorflowGemmlowpWorkersPool(thread::ThreadPool* workers)
      : workers_(workers) {}

  ~TensorflowGemmlowpWorkersPool() {
    // This workaround ensures that all worker tasks have exited methods in the
    // BlockingCounter. Without this, there is a race where the context is torn
    // down while the counter is in use.
    counter_to_decrement_when_ready_.Reset(0);
  }

  void Execute(const std::vector<gemmlowp::Task*>& tasks) {
    assert(!tasks.empty());
    assert(workers_ != nullptr);
    counter_to_decrement_when_ready_.Reset(tasks.size());
    for (gemmlowp::Task* task : tasks) {
      workers_->Schedule([this, task]() {
        // TODO(cwhipkey): get a local_allocator from a thread local storage.
        gemmlowp::Allocator local_allocator;
        CHECK(task != nullptr);
        task->local_allocator = &local_allocator;
        task->Run();
        counter_to_decrement_when_ready_.DecrementCount();
      });
    }
    counter_to_decrement_when_ready_.Wait();
    for (gemmlowp::Task* task : tasks) {
      delete task;
    }
  }

 private:
  thread::ThreadPool* const workers_;

  // The BlockingCounter used to wait for the workers.
  gemmlowp::BlockingCounter counter_to_decrement_when_ready_;

  TF_DISALLOW_COPY_AND_ASSIGN(TensorflowGemmlowpWorkersPool);
};

class TensorflowGemmContext : public gemmlowp::MultiThreadGemmContextBase {
 public:
  TensorflowGemmContext(int num_threads, thread::ThreadPool* workers)
      : workers_pool_(workers) {
    set_max_num_threads(num_threads);
  }

  TensorflowGemmlowpWorkersPool* workers_pool() { return &workers_pool_; }

 private:
  TensorflowGemmlowpWorkersPool workers_pool_;

  TF_DISALLOW_COPY_AND_ASSIGN(TensorflowGemmContext);
};
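// Assumed usage sketch (based on how the quantized kernels in this directory
// appear to use it, not on anything stated in this header): construct a
// TensorflowGemmContext from the op kernel's intra-op thread pool and pass it
// as the context argument to gemmlowp's multi-threaded GEMM entry points,
// which reach the pool through workers_pool() and run their tasks via
// Execute().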

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_