/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_

#include <stdint.h>
#include <sys/types.h>
#include <algorithm>
#include <cmath>
#include <limits>
#include <memory>
#include <type_traits>

#include "Eigen/Core"
#include "fixedpoint/fixedpoint.h"
#include "public/gemmlowp.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/round.h"
#include "tensorflow/contrib/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

inline int32 MultiplyByQuantizedMultiplierSmallerThanOne(
    int32 x, int32 quantized_multiplier, int right_shift) {
  using gemmlowp::RoundingDivideByPOT;
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  return RoundingDivideByPOT(
      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
}

inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
    int32 x, int32 quantized_multiplier, int left_shift) {
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
                                           quantized_multiplier);
}
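
// Worked example for the two helpers above (relying on gemmlowp's documented
// Q31 semantics, where the real multiplier is quantized_multiplier / 2^31):
// with x = 100, quantized_multiplier = 1 << 30 (i.e. 0.5) and right_shift = 1,
// SaturatingRoundingDoublingHighMul(100, 1 << 30) yields 50 and
// RoundingDivideByPOT(50, 1) yields 25, approximating 100 * 0.5 * 2^-1 = 25.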

template <typename T>
int CountLeadingZeros(T integer_input) {
  static_assert(std::is_unsigned<T>::value,
                "Only unsigned integer types handled.");
  const T one_in_leading_positive = static_cast<T>(1)
                                    << (std::numeric_limits<T>::digits - 1);
  int leading_zeros = 0;
  while (integer_input < one_in_leading_positive) {
    integer_input <<= 1;
    ++leading_zeros;
  }
  return leading_zeros;
}
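
// For example, CountLeadingZeros<uint8>(1) returns 7 and
// CountLeadingZeros<uint32>(1u << 31) returns 0. Note that the loop above does
// not terminate for integer_input == 0, so callers must pass a nonzero value.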

// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
// BROADCASTING.
//
// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
// rectangular array of numbers.
//
// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
// However, as Dims<N> is to be deprecated, this class exists as an adaptor
// to enable simple unoptimized implementations of element-wise broadcasting
// operations.
template <int N>
struct NdArrayDesc {
  // The "extent" of each dimension. Indices along dimension d must be in the
  // half-open interval [0, extents[d]).
  int extents[N];

  // The number of *elements* (not bytes) between consecutive indices of each
  // dimension.
  int strides[N];
};

// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// ELEMENT-WISE BROADCASTING.
//
// Same as Offset(), except it takes an NdArrayDesc<4> instead of a Dims<4>.
inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
                            int i3) {
  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
         i3 * desc.strides[3];
}
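
// For example, for a densely packed desc with extents {2, 3, 4, 5} (and hence
// strides {1, 2, 6, 24}), SubscriptToIndex(desc, 1, 2, 3, 4) returns
// 1*1 + 2*2 + 3*6 + 4*24 = 119, i.e. the last of the 120 elements.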

// Given the dimensions of the operands for an element-wise binary broadcast,
// adjusts them so that they can be directly iterated over with simple loops.
// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
//
// This function assumes that the two input shapes are compatible up to
// broadcasting and the shorter one has already been prepended with 1s to be the
// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
// Dims<N> refers to shapes in reverse order. In this case, input0_dims will be
// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
//
// When two shapes are compatible up to broadcasting, for each dimension d,
// the input extents are either equal, or one of them is 1.
//
// This function performs the following for each dimension d:
// - If the extents are equal, then do nothing since the loop that walks over
//   both of the input arrays is correct.
// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
//   array0 to be referenced *at any index* in dimension d and still access the
//   same slice.
template <int N>
inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
                                                const Dims<N>& input1_dims,
                                                NdArrayDesc<N>* desc0_out,
                                                NdArrayDesc<N>* desc1_out) {
  TFLITE_DCHECK(desc0_out != nullptr);
  TFLITE_DCHECK(desc1_out != nullptr);

  // Copy dims to desc.
  for (int i = 0; i < N; ++i) {
    desc0_out->extents[i] = input0_dims.sizes[i];
    desc0_out->strides[i] = input0_dims.strides[i];
    desc1_out->extents[i] = input1_dims.sizes[i];
    desc1_out->strides[i] = input1_dims.strides[i];
  }

  // Walk over each dimension. If the extents are equal do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other and
  // stride 0.
  for (int i = 0; i < N; ++i) {
    const int extent0 = ArraySize(input0_dims, i);
    const int extent1 = ArraySize(input1_dims, i);
    if (extent0 != extent1) {
      if (extent0 == 1) {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent1;
      } else {
        TFLITE_DCHECK_EQ(extent1, 1);
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent0;
      }
    }
  }
}
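
// Continuing the example above: with input0_dims of sizes (64, 16, 16, 1) and
// input1_dims of sizes (64, 1, 1, 1), dimensions 1 and 2 differ, so desc1_out
// is given extents (64, 16, 16, 1) with strides[1] == strides[2] == 0. Any
// subscript along those dimensions then maps to the single stored slice of
// input1, which is how the Broadcast* implementations later in this file
// realize the broadcast.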

inline void Conv(const float* input_data, const Dims<4>& input_dims,
                 const float* filter_data, const Dims<4>& filter_dims,
                 const float* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, float output_activation_min,
                 float output_activation_max, float* output_data,
                 const Dims<4>& output_dims, float* im2col_data,
                 const Dims<4>& im2col_dims) {
  (void)im2col_data;  // only used in optimized code.
  (void)im2col_dims;  // only used in optimized code.
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
  const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
  if (bias_data) {
    TFLITE_DCHECK_EQ(ArraySize(filter_dims, 3), ArraySize(bias_dims, 0));
  }
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          float total = 0.f;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
                const int in_x = in_x_origin + filter_x;
                const int in_y = in_y_origin + filter_y;
                // If the location is outside the bounds of the input image,
                // use zero as a default value.
                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height)) {
                  float input_value = input_data[Offset(input_dims, in_channel,
                                                        in_x, in_y, batch)];
                  float filter_value =
                      filter_data[Offset(filter_dims, in_channel, filter_x,
                                         filter_y, out_channel)];
                  total += (input_value * filter_value);
                }
              }
            }
          }
          float bias_value = 0.0f;
          if (bias_data) {
            bias_value = bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
          }
          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(total + bias_value,
                                           output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const float* input_data, const Dims<4>& input_dims,
          const float* filter_data, const Dims<4>& filter_dims,
          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
          int stride_height, int pad_width, int pad_height, float* output_data,
          const Dims<4>& output_dims, float* im2col_data,
          const Dims<4>& im2col_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
       stride_width, stride_height, pad_width, pad_height,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const float* input_data, const Dims<4>& input_dims,
          const float* filter_data, const Dims<4>& filter_dims,
          const float* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, float* output_data,
          const Dims<4>& output_dims, float* im2col_data,
          const Dims<4>& im2col_dims) {
  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
           bias_dims, stride, stride, pad_width, pad_height, output_data,
           output_dims, im2col_data, im2col_dims);
}

inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_dims;   // only used in optimized code.
  (void)gemm_context;  // only used in optimized code.
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
  const int output_depth =
      MatchingArraySize(filter_dims, 3, bias_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          int32 acc = 0;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
                const int in_x = in_x_origin + filter_x;
                const int in_y = in_y_origin + filter_y;
                // If the location is outside the bounds of the input image,
                // use zero as a default value.
                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height)) {
                  int32 input_val = input_data[Offset(input_dims, in_channel,
                                                      in_x, in_y, batch)];
                  int32 filter_val =
                      filter_data[Offset(filter_dims, in_channel, filter_x,
                                         filter_y, out_channel)];
                  acc +=
                      (filter_val + filter_offset) * (input_val + input_offset);
                }
              }
            }
          }
          if (bias_data) {
            acc += bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
          }
          acc = MultiplyByQuantizedMultiplierSmallerThanOne(
              acc, output_multiplier, output_shift);
          acc += output_offset;
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
              static_cast<uint8>(acc);
        }
      }
    }
  }
}
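
// A note on the quantized arithmetic above, assuming the usual gemmlowp-style
// affine quantization in which real = scale * (quantized - zero_point) and the
// *_offset parameters carry negated zero points: the accumulator 'acc' is in
// units of input_scale * filter_scale, and (output_multiplier, output_shift)
// are expected to encode input_scale * filter_scale / output_scale as a Q31
// multiplier plus a right shift, applied by
// MultiplyByQuantizedMultiplierSmallerThanOne.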

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
       filter_offset, bias_data, bias_dims, stride_width, stride_height,
       pad_width, pad_height, output_offset, output_multiplier, output_shift,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims, gemm_context);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const uint8* input_data, const Dims<4>& input_dims,
          int32 input_offset, const uint8* filter_data,
          const Dims<4>& filter_dims, int32 filter_offset,
          const int32* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, int32 output_offset,
          int32 output_multiplier, int output_shift,
          int32 output_activation_min, int32 output_activation_max,
          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
  Conv<Ac>(input_data, input_dims, input_offset, filter_data, filter_dims,
           filter_offset, bias_data, bias_dims, stride, stride, pad_width,
           pad_height, output_offset, output_multiplier, output_shift,
           output_activation_min, output_activation_max, output_data,
           output_dims, im2col_data, im2col_dims, gemm_context);
}

template <typename T>
inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
                         int block_size, T* output_data,
                         const Dims<4>& output_dims) {
  const int input_depth = ArraySize(input_dims, 0);
  const int input_width = ArraySize(input_dims, 1);
  const int input_height = ArraySize(input_dims, 2);
  const int input_batch = ArraySize(input_dims, 3);

  const int output_depth = ArraySize(output_dims, 0);
  const int output_width = ArraySize(output_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_batch = ArraySize(output_dims, 3);

  TFLITE_DCHECK_EQ(input_width * block_size, output_width);
  TFLITE_DCHECK_EQ(input_height * block_size, output_height);
  TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          const int in_d =
              out_d + ((out_h % block_size) * block_size + out_w % block_size) *
                          output_depth;
          const int in_w = out_w / block_size;
          const int in_h = out_h / block_size;
          const int in_b = out_b;

          const int output_index =
              Offset(output_dims, out_d, out_w, out_h, out_b);
          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}
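
// For example, with block_size = 2, a (batch, height, width, depth) input of
// shape (1, 1, 1, 4) holding {1, 2, 3, 4} becomes a (1, 2, 2, 1) output laid
// out as
//   1 2
//   3 4
// SpaceToDepth below performs the inverse rearrangement.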

template <typename T>
inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
                         int block_size, T* output_data,
                         const Dims<4>& output_dims) {
  const int input_depth = ArraySize(input_dims, 0);
  const int input_width = ArraySize(input_dims, 1);
  const int input_height = ArraySize(input_dims, 2);
  const int input_batch = ArraySize(input_dims, 3);

  const int output_depth = ArraySize(output_dims, 0);
  const int output_width = ArraySize(output_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_batch = ArraySize(output_dims, 3);

  TFLITE_DCHECK_EQ(input_width, output_width * block_size);
  TFLITE_DCHECK_EQ(input_height, output_height * block_size);
  TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int in_b = 0; in_b < input_batch; ++in_b) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      for (int in_w = 0; in_w < input_width; ++in_w) {
        for (int in_d = 0; in_d < input_depth; ++in_d) {
          const int out_d =
              in_d + ((in_h % block_size) * block_size + in_w % block_size) *
                         input_depth;
          const int out_w = in_w / block_size;
          const int out_h = in_h / block_size;
          const int out_b = in_b;

          const int output_index =
              Offset(output_dims, out_d, out_w, out_h, out_b);
          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}

inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                           const float* weights_data,
                           const Dims<4>& weights_dims, const float* bias_data,
                           const Dims<4>& bias_dims,
                           float output_activation_min,
                           float output_activation_max, float* output_data,
                           const Dims<4>& output_dims) {
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists of overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track, for each
  // array, of which of its dimensions is the batch dimension.
  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                      ArraySize(output_dims, 3);
  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
  const int accum_depth = ArraySize(weights_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      float total = 0.f;
      for (int d = 0; d < accum_depth; ++d) {
        total += input_data[b * accum_depth + d] *
                 weights_data[out_c * accum_depth + d];
      }
      float bias_value = 0.0f;
      if (bias_data) {
        bias_value = bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
      }
      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
          total + bias_value, output_activation_min, output_activation_max);
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                    const float* weights_data, const Dims<4>& weights_dims,
                    const float* bias_data, const Dims<4>& bias_dims,
                    float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
                 bias_dims, output_activation_min, output_activation_max,
                 output_data, output_dims);
}

inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                           int32 input_offset, const uint8* filter_data,
                           const Dims<4>& filter_dims, int32 filter_offset,
                           const int32* bias_data, const Dims<4>& bias_dims,
                           int32 output_offset, int32 output_multiplier,
                           int output_shift, int32 output_activation_min,
                           int32 output_activation_max, uint8* output_data,
                           const Dims<4>& output_dims,
                           gemmlowp::GemmContext* gemm_context) {
  (void)gemm_context;  // only used in optimized code.
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists of overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track, for each
  // array, of which of its dimensions is the batch dimension.
  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                      ArraySize(output_dims, 3);
  const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0);
  const int accum_depth = ArraySize(filter_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      int32 acc = 0;
      for (int d = 0; d < accum_depth; ++d) {
        int32 input_val = input_data[b * accum_depth + d];
        int32 filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data) {
        acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
      }
      acc = MultiplyByQuantizedMultiplierSmallerThanOne(acc, output_multiplier,
                                                        output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                    int32 input_offset, const uint8* filter_data,
                    const Dims<4>& filter_dims, int32 filter_offset,
                    const int32* bias_data, const Dims<4>& bias_dims,
                    int32 output_offset, int32 output_multiplier,
                    int output_shift, int32 output_activation_min,
                    int32 output_activation_max, uint8* output_data,
                    const Dims<4>& output_dims,
                    gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
                 filter_offset, bias_data, bias_dims, output_offset,
                 output_multiplier, output_shift, output_activation_min,
                 output_activation_max, output_data, output_dims, gemm_context);
}

template <FusedActivationFunctionType Ac>
void NonGlobalBatchNormalization(
    const float* input_data, const Dims<4>& input_dims, const float* mean_data,
    const Dims<4>& mean_dims, const float* multiplier_data,
    const Dims<4>& multiplier_dims, const float* offset_data,
    const Dims<4>& offset_dims, float* output_data,
    const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
                        offset_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
                        offset_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                        offset_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              (input_data[Offset(input_dims, c, x, y, b)] -
               mean_data[Offset(mean_dims, c, x, y, 0)]) *
                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
              offset_data[Offset(offset_dims, c, x, y, 0)]);
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
void GlobalBatchNormalization(const float* input_data,
                              const Dims<4>& input_dims, const float* mean_data,
                              const Dims<4>& mean_dims,
                              const float* multiplier_data,
                              const Dims<4>& multiplier_dims,
                              const float* offset_data,
                              const Dims<4>& offset_dims, float* output_data,
                              const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                        offset_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              (input_data[Offset(input_dims, c, x, y, b)] -
               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
        }
      }
    }
  }
}
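
// In both batch-normalization kernels above, the computation is
//   output = (input - mean) * multiplier + offset,
// where 'multiplier' is presumably the precomputed gamma / sqrt(variance +
// epsilon) and 'offset' the learned beta. That factoring is an assumption
// about how callers prepare the inputs; the kernels themselves just apply
// the affine transform.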

inline void Relu(const float* input_data, const Dims<4>& input_dims,
                 float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float lower = 0;
          float clamped = val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

inline void Relu1(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float upper = 1;
          const float lower = -1;
          float clamped = val > upper ? upper : val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

inline void Relu6(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float upper = 6;
          const float lower = 0;
          float clamped = val > upper ? upper : val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                     float* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone, "");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        float squared_l2_norm = 0;
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          squared_l2_norm += val * val;
        }
        float l2_norm = std::sqrt(squared_l2_norm);
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              input_data[Offset(input_dims, c, x, y, b)] / l2_norm;
        }
      }
    }
  }
}
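
// Note that the float L2Normalization above computes
//   output = input / sqrt(sum of squares over the depth dimension)
// independently at each (batch, y, x) position, with no epsilon guard: if all
// inputs at a position are exactly 0.f, the division produces NaNs.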

inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
                                          int* output_shift) {
  *output_shift = 11;
  while (input >= (1 << 29)) {
    input /= 4;
    ++*output_shift;
  }
  TFLITE_DCHECK_GT(input, 0);
  const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
  *output_shift -= left_shift_bit_pairs;
  input <<= 2 * left_shift_bit_pairs;
  TFLITE_DCHECK_GE(input, (1 << 27));
  TFLITE_DCHECK_LT(input, (1 << 29));
  using gemmlowp::FixedPoint;
  using gemmlowp::Rescale;
  using gemmlowp::SaturatingRoundingMultiplyByPOT;
  // Using 3 integer bits gives us enough room for the internal arithmetic in
  // this Newton-Raphson iteration.
  using F3 = FixedPoint<int32, 3>;
  using F0 = FixedPoint<int32, 0>;
  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
  const F3 fixedpoint_half_input =
      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
  const F3 fixedpoint_half_three =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
  // Newton-Raphson iteration
  // Naive unoptimized starting guess: x = 1
  F3 x = F3::One();
  // Naive unoptimized number of iterations: 5
  for (int i = 0; i < 5; i++) {
    const F3 x3 = Rescale<3>(x * x * x);
    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
  }
  const F0 fixedpoint_half_sqrt_2 =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
  x = x * fixedpoint_half_sqrt_2;
  *output_inv_sqrt = x.raw();
  if (*output_shift < 0) {
    *output_inv_sqrt <<= -*output_shift;
    *output_shift = 0;
  }
}
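
// The recurrence above is the standard Newton-Raphson iteration for the
// inverse square root: solving f(x) = 1/x^2 - v = 0 gives
//   x_{n+1} = x_n * (3 - v * x_n^2) / 2 = 1.5 * x_n - (v / 2) * x_n^3,
// which is exactly the fixedpoint_half_three * x - fixedpoint_half_input * x3
// update, with the input pre-normalized into [2^27, 2^29) (see the DCHECKs) so
// that the 3-integer-bit fixed-point format has enough headroom.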

inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
                            int32 input_zero_point, uint8* output_data,
                            const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  TFLITE_DCHECK_EQ(batches, 1);
  TFLITE_DCHECK_EQ(height, 1);
  TFLITE_DCHECK_EQ(width, 1);
  int32 square_l2_norm = 0;
  for (int i = 0; i < depth; i++) {
    int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
    square_l2_norm += diff * diff;
  }
  int32 inv_l2norm_multiplier;
  int inv_l2norm_shift;
  GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
                                &inv_l2norm_shift);

  for (int i = 0; i < depth; i++) {
    int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
    int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
        128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
    int32 unclamped_output_val = 128 + rescaled_diff;
    int32 output_val = std::min(255, std::max(0, unclamped_output_val));
    output_data[Offset(output_dims, i, 0, 0, 0)] =
        static_cast<uint8>(output_val);
  }
}
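
// In the quantized L2Normalization above, the factor of 128 and the +128 bias
// suggest an output quantization with zero point 128 and scale 1/128, so the
// uint8 result spans roughly [-1, 1] in real values, matching the range of an
// L2-normalized vector. (This is inferred from the constants; the original
// code does not state it.)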

inline void Add(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] +
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Add(const float* input1_data, const Dims<4>& input1_dims,
         const float* input2_data, const Dims<4>& input2_dims,
         float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
      output_activation_max, output_data, output_dims);
}

template <FusedActivationFunctionType Ac>
inline void Add(int left_shift, const uint8* input1_data,
                const Dims<4>& input1_dims, int32 input1_offset,
                int32 input1_multiplier, int input1_shift,
                const uint8* input2_data, const Dims<4>& input2_dims,
                int32 input2_offset, int32 input2_multiplier, int input2_shift,
                int32 output_offset, int32 output_multiplier, int output_shift,
                int32 output_activation_min, int32 output_activation_max,
                uint8* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const int32 input1_val =
              input1_offset + input1_data[Offset(input1_dims, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[Offset(input2_dims, c, x, y, b)];
          const int32 shifted_input1_val = input1_val * (1 << left_shift);
          const int32 shifted_input2_val = input2_val * (1 << left_shift);
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input1_val, input1_multiplier, input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input2_val, input2_multiplier, input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  raw_sum, output_multiplier, output_shift) +
              output_offset;
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, raw_output));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}
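
// In the quantized Add above, both inputs are first left-shifted by
// 'left_shift' to gain precision headroom, then rescaled onto a common scale
// by their per-input Q31 multipliers before the integer sum; the sum is then
// rescaled to the output scale and offset by the output zero point. The
// initial shift preserves precision through the two rescaling steps.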

// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <typename T>
void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T output_activation_min, T output_activation_max,
                  T* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac, typename T>
void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T* output_data, const Dims<4>& output_dims) {
  T output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
               output_activation_min, output_activation_max, output_data,
               output_dims);
}

inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                         const Dims<4>& input1_dims, int32 input1_offset,
                         int32 input1_multiplier, int input1_shift,
                         const uint8* input2_data, const Dims<4>& input2_dims,
                         int32 input2_offset, int32 input2_multiplier,
                         int input2_shift, int32 output_offset,
                         int32 output_multiplier, int output_shift,
                         int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          const int32 input1_val =
              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
          const int32 shifted_input1_val = input1_val * (1 << left_shift);
          const int32 shifted_input2_val = input2_val * (1 << left_shift);
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input1_val, input1_multiplier, input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input2_val, input2_multiplier, input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  raw_sum, output_multiplier, output_shift) +
              output_offset;
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, raw_output));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                         const Dims<4>& input1_dims, int32 input1_offset,
                         int32 input1_multiplier, int input1_shift,
                         const uint8* input2_data, const Dims<4>& input2_dims,
                         int32 input2_offset, int32 input2_multiplier,
                         int input2_shift, int32 output_offset,
                         int32 output_multiplier, int output_shift,
                         int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
               input1_multiplier, input1_shift, input2_data, input2_dims,
               input2_offset, input2_multiplier, input2_shift, output_offset,
               output_multiplier, output_shift, output_activation_min,
               output_activation_max, output_data, output_dims);
}

inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] *
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Mul(const float* input1_data, const Dims<4>& input1_dims,
         const float* input2_data, const Dims<4>& input2_dims,
         float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
      output_activation_max, output_data, output_dims);
}

// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <typename T>
void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T output_activation_min, T output_activation_max,
                  T* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

    1083   // In TensorFlow, the dimensions are canonically named (batch_number, row,
   1084   // col, channel), with extents (batches, height, width, depth), with the
   1085   // trailing dimension changing most rapidly (channels has the smallest
   1086   // stride, typically 1 element).
   1087   //
   1088   // In generated C code, we store arrays with the dimensions reversed. The
   1089   // first dimension has smallest stride.
   1090   //
    1091   // We name our variables by their TensorFlow convention, but generate C code
   1092   // nesting loops such that the innermost loop has the smallest stride for
   1093   // the best cache behavior.
   1094   for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
   1095     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
   1096       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
   1097         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
   1098           output_data[Offset(output_dims, c, x, y, b)] =
   1099               ActivationFunctionWithMinMax(
   1100                   input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
   1101                       input2_data[SubscriptToIndex(desc2, c, x, y, b)],
   1102                   output_activation_min, output_activation_max);
   1103         }
   1104       }
   1105     }
   1106   }
   1107 }
   1108 
   1109 // legacy, for compatibility with old checked-in code
   1110 template <FusedActivationFunctionType Ac, typename T>
   1111 void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
   1112                   const T* input2_data, const Dims<4>& input2_dims,
   1113                   T* output_data, const Dims<4>& output_dims) {
   1114   T output_activation_min, output_activation_max;
   1115   GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
   1116 
   1117   BroadcastMul(input1_data, input1_dims, input2_data, input2_dims,
   1118                output_activation_min, output_activation_max, output_data,
   1119                output_dims);
   1120 }
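
         // Usage sketch for the float BroadcastMul above, with hypothetical shapes
         // and names: scaling a dense 1x2x2x3 tensor by a per-channel 1x1x1x3
         // tensor. This assumes the dense Dims<4> layout from types.h, where
         // sizes[] runs from the innermost (channel) dimension outwards and
         // strides[0] == 1.
         inline void ExampleBroadcastMulPerChannel(const float* input,
                                                   const float* scale_per_channel,
                                                   float* output) {
           Dims<4> big;
           big.sizes[0] = 3;  // depth
           big.sizes[1] = 2;  // width
           big.sizes[2] = 2;  // height
           big.sizes[3] = 1;  // batches
           Dims<4> small;
           small.sizes[0] = 3;
           small.sizes[1] = 1;
           small.sizes[2] = 1;
           small.sizes[3] = 1;
           for (int d = 0; d < 4; ++d) {
             big.strides[d] = d == 0 ? 1 : big.strides[d - 1] * big.sizes[d - 1];
             small.strides[d] =
                 d == 0 ? 1 : small.strides[d - 1] * small.sizes[d - 1];
           }
           // No fused activation: pass the widest possible float interval.
           BroadcastMul(input, big, scale_per_channel, small,
                        std::numeric_limits<float>::lowest(),
                        std::numeric_limits<float>::max(), output, big);
         }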
   1121 
   1122 inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
   1123                          int32 input1_offset, const uint8* input2_data,
   1124                          const Dims<4>& input2_dims, int32 input2_offset,
   1125                          int32 output_offset, int32 output_multiplier,
   1126                          int output_shift, int32 output_activation_min,
   1127                          int32 output_activation_max, uint8* output_data,
   1128                          const Dims<4>& output_dims) {
   1129   gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
   1130 
   1131   NdArrayDesc<4> desc1;
   1132   NdArrayDesc<4> desc2;
   1133   NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
   1134 
    1135   // In TensorFlow, the dimensions are canonically named (batch_number, row,
   1136   // col, channel), with extents (batches, height, width, depth), with the
   1137   // trailing dimension changing most rapidly (channels has the smallest
   1138   // stride, typically 1 element).
   1139   //
   1140   // In generated C code, we store arrays with the dimensions reversed. The
   1141   // first dimension has smallest stride.
   1142   //
    1143   // We name our variables by their TensorFlow convention, but generate C code
   1144   // nesting loops such that the innermost loop has the smallest stride for
   1145   // the best cache behavior.
   1146   for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
   1147     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
   1148       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
   1149         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
   1150           const int32 input1_val =
   1151               input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
   1152           const int32 input2_val =
   1153               input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
   1154           const int32 unclamped_result =
   1155               output_offset +
   1156               MultiplyByQuantizedMultiplierSmallerThanOne(
   1157                   input1_val * input2_val, output_multiplier, output_shift);
   1158           const int32 clamped_output =
   1159               std::min(output_activation_max,
   1160                        std::max(output_activation_min, unclamped_result));
   1161           output_data[Offset(output_dims, c, x, y, b)] =
   1162               static_cast<uint8>(clamped_output);
   1163         }
   1164       }
   1165     }
   1166   }
   1167 }
   1168 
   1169 // legacy, for compatibility with old checked-in code
   1170 template <FusedActivationFunctionType Ac>
   1171 inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
   1172                          int32 input1_offset, const uint8* input2_data,
   1173                          const Dims<4>& input2_dims, int32 input2_offset,
   1174                          int32 output_offset, int32 output_multiplier,
   1175                          int output_shift, int32 output_activation_min,
   1176                          int32 output_activation_max, uint8* output_data,
   1177                          const Dims<4>& output_dims) {
   1178   BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
   1179                input2_dims, input2_offset, output_offset, output_multiplier,
   1180                output_shift, output_activation_min, output_activation_max,
   1181                output_data, output_dims);
   1182 }
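
         // Derivation of the 8-bit Mul arithmetic above, assuming the usual TFLite
         // affine quantization real = scale * (q - zero_point) with the negated
         // zero points passed in as input1_offset and input2_offset: matching
         //   S_out * (q_out - zp_out) = S1 * (q1 - zp1) * S2 * (q2 - zp2)
         // gives q_out = zp_out + M * (q1 - zp1) * (q2 - zp2), where the real
         // multiplier M = S1 * S2 / S_out < 1 is exactly what output_multiplier
         // and output_shift encode in fixed point.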
   1183 
   1184 inline void Div(const float* input1_data, const Dims<4>& input1_dims,
   1185                 const float* input2_data, const Dims<4>& input2_dims,
   1186                 float output_activation_min, float output_activation_max,
   1187                 float* output_data, const Dims<4>& output_dims) {
   1188   const int batches =
   1189       MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
   1190   const int height =
   1191       MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
   1192   const int width =
   1193       MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
   1194   const int depth =
   1195       MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
   1196   for (int b = 0; b < batches; ++b) {
   1197     for (int y = 0; y < height; ++y) {
   1198       for (int x = 0; x < width; ++x) {
   1199         for (int c = 0; c < depth; ++c) {
   1200           output_data[Offset(output_dims, c, x, y, b)] =
   1201               ActivationFunctionWithMinMax(
   1202                   input1_data[Offset(input1_dims, c, x, y, b)] /
   1203                       input2_data[Offset(input2_dims, c, x, y, b)],
   1204                   output_activation_min, output_activation_max);
   1205         }
   1206       }
   1207     }
   1208   }
   1209 }
   1210 
   1211 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
   1212                 const float* input2_data, const Dims<4>& input2_dims,
   1213                 float output_activation_min, float output_activation_max,
   1214                 float* output_data, const Dims<4>& output_dims) {
   1215   const int batches =
   1216       MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
   1217   const int height =
   1218       MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
   1219   const int width =
   1220       MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
   1221   const int depth =
   1222       MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
   1223   for (int b = 0; b < batches; ++b) {
   1224     for (int y = 0; y < height; ++y) {
   1225       for (int x = 0; x < width; ++x) {
   1226         for (int c = 0; c < depth; ++c) {
   1227           output_data[Offset(output_dims, c, x, y, b)] =
   1228               ActivationFunctionWithMinMax(
   1229                   input1_data[Offset(input1_dims, c, x, y, b)] -
   1230                       input2_data[Offset(input2_dims, c, x, y, b)],
   1231                   output_activation_min, output_activation_max);
   1232         }
   1233       }
   1234     }
   1235   }
   1236 }
   1237 
   1238 template <FusedActivationFunctionType Ac, typename Scalar>
   1239 void Concatenation(int concat_dim, const Scalar* const* input_data,
   1240                    const Dims<4>* const* input_dims, int inputs_count,
   1241                    Scalar* output_data, const Dims<4>& output_dims) {
   1242   TFLITE_DCHECK_GT(inputs_count, 1);
   1243   int concat_size = 0;
   1244   for (int i = 0; i < inputs_count; i++) {
   1245     for (int j = 0; j < 4; j++) {
   1246       if (j != concat_dim) {
   1247         MatchingArraySize(*input_dims[i], j, output_dims, j);
   1248       }
   1249     }
   1250     concat_size += ArraySize(*input_dims[i], concat_dim);
   1251   }
   1252   TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
   1253   TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
   1254   int outer_size = 1;
   1255   for (int i = concat_dim + 1; i < 4; i++) {
   1256     outer_size *= output_dims.sizes[i];
   1257   }
   1258   Scalar* output_ptr = output_data;
   1259   for (int k = 0; k < outer_size; k++) {
   1260     for (int i = 0; i < inputs_count; ++i) {
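               // Copy one contiguous slice from input i. Each input is assumed
               // densely packed, so sizes[concat_dim] * strides[concat_dim] is the
               // number of elements one slice of that input contributes per outer
               // iteration.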
   1261       const int copy_size =
   1262           input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
   1263       memcpy(output_ptr, input_data[i] + k * copy_size,
   1264              copy_size * sizeof(Scalar));
   1265       output_ptr += copy_size;
   1266     }
   1267   }
   1268 }
   1269 
   1270 template <FusedActivationFunctionType Ac, typename Scalar>
   1271 void DepthConcatenation(const Scalar* const* input_data,
   1272                         const Dims<4>* const* input_dims, int inputs_count,
   1273                         Scalar* output_data, const Dims<4>& output_dims) {
   1274   Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
   1275                             output_data, output_dims);
   1276 }
   1277 
   1278 inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
   1279                      const float* prev_activ_data,
   1280                      const Dims<4>& prev_activ_dims, const float* weights_data,
   1281                      const Dims<4>& weights_dims, const float* bias_data,
   1282                      const Dims<4>& bias_dims, const float* prev_state_data,
   1283                      const Dims<4>& prev_state_dims, float* output_state_data,
   1284                      const Dims<4>& output_state_dims, float* output_activ_data,
   1285                      const Dims<4>& output_activ_dims, float* concat_temp_data,
   1286                      const Dims<4>& concat_temp_dims, float* activ_temp_data,
   1287                      const Dims<4>& activ_temp_dims) {
   1288   const int batches =
   1289       MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
   1290                         output_state_dims, 3, output_activ_dims, 3);
   1291   const int height =
   1292       MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
   1293                         output_state_dims, 2, output_activ_dims, 2);
   1294   const int width =
   1295       MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
   1296                         output_state_dims, 1, output_activ_dims, 1);
   1297   TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
   1298   TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
   1299   const int input_depth = ArraySize(input_dims, 0);
   1300   const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
   1301   const int total_input_depth = prev_activ_depth + input_depth;
   1302   TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
   1303   TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
   1304                   1);
   1305   const int intern_activ_depth =
   1306       MatchingArraySize(weights_dims, 1, bias_dims, 0);
   1307   TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
   1308   const int output_depth =
   1309       MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
   1310                         output_state_dims, 0, output_activ_dims, 0);
   1311   TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
   1312 
   1313   // Concatenate prev_activ and input data together
   1314   std::vector<float const*> concat_input_arrays_data;
   1315   std::vector<Dims<4> const*> concat_input_arrays_dims;
   1316   concat_input_arrays_data.push_back(input_data);
   1317   concat_input_arrays_data.push_back(prev_activ_data);
   1318   concat_input_arrays_dims.push_back(&input_dims);
   1319   concat_input_arrays_dims.push_back(&prev_activ_dims);
   1320   Concatenation<FusedActivationFunctionType::kNone, float>(
   1321       0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]),
   1322       concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims);
   1323 
   1324   // Fully connected
   1325   FullyConnected<FusedActivationFunctionType::kNone>(
   1326       concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data,
   1327       bias_dims, activ_temp_data, activ_temp_dims);
   1328 
   1329   // Memory state update (the LSTM "guts")
   1330   for (int b = 0; b < batches; ++b) {
   1331     for (int w = 0; w < width; ++w) {
   1332       for (int h = 0; h < height; ++h) {
   1333         for (int c = 0; c < output_depth; ++c) {
   1334           const float input_gate =
   1335               1.f /
   1336               (1.f + std::exp(-activ_temp_data[Offset(
   1337                          activ_temp_dims, 0 * output_depth + c, w, h, b)]));
   1338           const float new_input = std::tanh(activ_temp_data[Offset(
   1339               activ_temp_dims, 1 * output_depth + c, w, h, b)]);
   1340           const float forget_gate =
   1341               1.f /
   1342               (1.f + std::exp(-activ_temp_data[Offset(
   1343                          activ_temp_dims, 2 * output_depth + c, w, h, b)]));
   1344           const float output_gate =
   1345               1.f /
   1346               (1.f + std::exp(-activ_temp_data[Offset(
   1347                          activ_temp_dims, 3 * output_depth + c, w, h, b)]));
   1348           const float new_state =
   1349               input_gate * new_input +
   1350               forget_gate *
   1351                   prev_state_data[Offset(prev_state_dims, c, w, h, b)];
   1352           output_state_data[Offset(output_state_dims, c, w, h, b)] = new_state;
   1353           output_activ_data[Offset(output_activ_dims, c, w, h, b)] =
   1354               output_gate * std::tanh(new_state);
   1355         }
   1356       }
   1357     }
   1358   }
   1359 }
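
         // In equations, for each batch/spatial position and output channel c, the
         // loop above computes the standard LSTM update
         //   i = logistic(a[0 * D + c])   (input gate)
         //   g = tanh(a[1 * D + c])       (input modulation, i.e. new input)
         //   f = logistic(a[2 * D + c])   (forget gate)
         //   o = logistic(a[3 * D + c])   (output gate)
         //   state  = i * g + f * prev_state
         //   output = o * tanh(state)
         // where a is the fully-connected output (activ_temp) and D = output_depth.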
   1360 
   1361 // Quantized LSTM cell implementation.
    1362 // The quantization of the input and output arrays is as follows:
   1363 //  - The input activations are quantized as uint8 on the interval
   1364 //    [-1, 127/128].
    1365 //    The rationale is that this is the natural interval for output
   1366 //    activations (see next point) and these need to be concatenated together.
   1367 //    We could accommodate different ranges by re-scaling, but we empirically
   1368 //    found that setting the input activations range to be [-1, 127/128] in the
   1369 //    first place, removing the need for re-scaling, greatly improves accuracy.
   1370 //  - The output activations are quantized as uint8 on the interval
   1371 //    [-1, 127/128].
    1372 //    The rationale is that the definition of an LSTM cell makes them
   1373 //    intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128]
   1374 //    makes for simpler, more accurate fixed-point arithmetic.
    1375 //  - The output-at-previous-timestep state array is quantized in the same
    1376 //    way as the output activations.
   1377 //  - The internal LSTM memory (not the output-at-previous-timestep, the other
   1378 //    internal state array) is int16-quantized and may use any power-of-two,
   1379 //    symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call
   1380 //    StateIntegerBits below, see the below discussion of that template
   1381 //    parameter ("The StateIntegerBits template parameter").
   1382 //  - The output of the internal fully-connected node is int16-quantized
   1383 //    on the interval [-8, 8 * 32767/32768], the rationale for which is
   1384 //    explained just below ("Why [-8, 8] for fully-connected output?").
   1385 //
   1386 //
   1387 // === The StateIntegerBits template parameter ===
   1388 //
   1389 // The StateIntegerBits template parameter controls the fixed-point format used
   1390 // to represent the internal memory of the LSTM cell (not the
   1391 // output-at-previous-timestep, the other internal state array). It's currently
   1392 // a template parameter so that the model can control that. The most typical
   1393 // value for StateIntegerBits is 4. Other plausible values are anywhere between
   1394 // 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
   1395 // and drop that template parameter. The reason why it can't be a runtime
   1396 // parameter is that this controls the fixed-point format used, i.e. we need to
   1397 // generate actually different code based on it. In particular, we generate code
   1398 // for a fixed-point tanh() implementation for that format, which internally
   1399 // uses a fixed-point exp() implementation, which internally uses a
   1400 // barrel-shifter with a number of steps that depends on StateIntegerBits.
   1401 // Another consequence of that is that a higher value of StateIntegerBits
   1402 // results in a more expensive implementation (more barrel shifter steps
   1403 // needed).
   1404 //
   1405 //
   1406 // === Why [-8, 8] for fully-connected output? ===
   1407 //
   1408 // This array is only fed to Logistic and Tanh functions, for which
   1409 // the quantized implementation will want to use fixed-point arithmetic,
   1410 // requiring a power-of-two representation interval. Thus, we should right
   1411 // away quantize this array to a power-of-two interval; otherwise,
    1412 // the implementation will need to rescale it, losing any benefit that a
    1413 // tighter representation interval might otherwise yield, while introducing
    1414 // some numerical error and computational overhead.
   1415 //
   1416 // Now, Logistic and Tanh
   1417 // are nearly constant (nearly equal to their horizontal asymptotes)
   1418 // outside of a small bounded interval around 0:
   1419 //
   1420 //   Logistic(4) = 1 - 1.8e-2     Tanh(4) = 1 - 6.7e-4
   1421 //   Logistic(8) = 1 - 3.4e-4     Tanh(8) = 1 - 2.3e-7
   1422 //   Logistic(16) = 1 - 1.1e-7    Tanh(16) = 1 - 2.5e-14
   1423 //
   1424 // From this, we see that clamping to [-4, 4] would be too inaccurate
   1425 // (the error of 1.8e-2 on Logistic would be felt even in 8bit precision)
   1426 // while clamping to [-16, 16] would make no difference even in float32.
   1427 // However, for a fixed-point implementation in 16-bit integers, using 5
   1428 // integer bits to represent the [-16, 16] range would leave only 11
   1429 // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
    1430 // representable values. Notice that this is higher than the
   1431 // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
   1432 // Using [-8, 8] thus seems like the better compromise overall, enjoying
   1433 // an increment of 2.4e-4 between representable values and a worst-case
   1434 // clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
   1435 // [-16, 16].
   1436 //
   1437 // Moreover, all other things being equal, it is nice to choose the narrower
   1438 // representation range, as that makes the implementation of fixed-point
   1439 // math functions a little cheaper (each integer bit requires an additional
    1440 // barrel-shifter step in the implementation of exp(-x)). That is a further
   1441 // reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
   1442 // sense for 32-bit float or 32-bit fixed-point quantization, but we are
   1443 // aiming for 16-bit fixed-point quantization of these internal nodes here.
   1444 //
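         // As a concrete check of the figures above: a 16-bit fixed-point value
         // with 3 integer bits has 16 - 1 (sign) - 3 = 12 fractional bits, i.e. an
         // increment of 2^-12 ~= 2.4e-4 between representable values, whereas 4
         // integer bits ([-16, 16]) leave only 11 fractional bits, i.e.
         // 2^-11 ~= 4.9e-4.
         //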
   1445 template <int StateIntegerBits>
   1446 void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
   1447               const uint8* prev_activ_data_uint8,
   1448               const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
   1449               const Dims<4>& weights_dims, const int32* bias_data_int32,
   1450               const Dims<4>& bias_dims, const int16* prev_state_data_int16,
   1451               const Dims<4>& prev_state_dims, int16* output_state_data_int16,
   1452               const Dims<4>& output_state_dims, uint8* output_activ_data_uint8,
   1453               const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8,
   1454               const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16,
   1455               const Dims<4>& activ_temp_dims, int32 weights_zero_point,
   1456               int32 accum_multiplier, int accum_shift,
   1457               gemmlowp::GemmContext* gemm_context) {
   1458   (void)gemm_context;  // only used in optimized code.
   1459 
   1460   // Gather dimensions information, and perform consistency checks.
   1461   const int batches =
   1462       MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
   1463                         output_state_dims, 3, output_activ_dims, 3);
   1464   const int height =
   1465       MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
   1466                         output_state_dims, 2, output_activ_dims, 2);
   1467   const int width =
   1468       MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
   1469                         output_state_dims, 1, output_activ_dims, 1);
   1470   TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
   1471   TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
   1472   const int input_depth = ArraySize(input_dims, 0);
   1473   const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
   1474   const int total_input_depth = prev_activ_depth + input_depth;
   1475   TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
   1476   TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
   1477                   1);
   1478   const int intern_activ_depth =
   1479       MatchingArraySize(weights_dims, 1, bias_dims, 0);
   1480   TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
   1481   const int output_depth =
   1482       MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
   1483                         output_state_dims, 0, output_activ_dims, 0);
   1484   TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
   1485   const int fc_batches = ArraySize(activ_temp_dims, 1) *
   1486                          ArraySize(activ_temp_dims, 2) *
   1487                          ArraySize(activ_temp_dims, 3);
   1488   const int fc_output_depth =
   1489       MatchingArraySize(weights_dims, 1, activ_temp_dims, 0);
   1490   const int fc_accum_depth = ArraySize(weights_dims, 0);
   1491   TFLITE_CHECK_EQ(fc_output_depth, 4 * output_depth);
   1492 
   1493   // Depth-concatenate prev_activ and input data together.
   1494   uint8 const* concat_input_arrays_data[2] = {input_data_uint8,
   1495                                               prev_activ_data_uint8};
   1496   Dims<4> const* concat_input_arrays_dims[2] = {&input_dims, &prev_activ_dims};
   1497   Concatenation<FusedActivationFunctionType::kNone, uint8>(
   1498       0, concat_input_arrays_data, concat_input_arrays_dims, 2,
   1499       concat_temp_data_uint8, concat_temp_dims);
   1500 
   1501   // Implementation of the fully connected node inside the LSTM cell.
   1502   // The operands are 8-bit integers, the accumulators are internally 32bit
   1503   // integers, and the output is 16-bit fixed-point with 3 integer bits so
   1504   // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
   1505   // is explained in the function comment above.
   1506   for (int b = 0; b < fc_batches; ++b) {
   1507     for (int out_c = 0; out_c < fc_output_depth; ++out_c) {
   1508       // Internal accumulation.
   1509       // Initialize accumulator with the bias-value.
   1510       int32 accum = bias_data_int32[out_c];
   1511       // Accumulation loop.
   1512       for (int d = 0; d < fc_accum_depth; ++d) {
   1513         int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
   1514         int16 weights_val =
   1515             weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
   1516         accum += input_val * weights_val;
   1517       }
   1518       // Down-scale the final int32 accumulator to the scale used by our
   1519       // (16-bit, using 3 integer bits) fixed-point format. The quantized
   1520       // multiplier and shift here have been pre-computed offline
   1521       // (e.g. by toco).
   1522       accum =
   1523           MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
   1524       // Saturate, cast to int16, and store to the temporary activations array.
   1525       accum = std::max(-32768, std::min(32767, accum));
   1526       activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
   1527     }
   1528   }
   1529 
   1530   // Rest of the LSTM cell: tanh and logistic math functions, and some adds
   1531   // and muls, all done in 16-bit fixed-point.
   1532   const int outer_size = batches * width * height;
   1533   for (int b = 0; b < outer_size; ++b) {
   1534     for (int c = 0; c < output_depth; ++c) {
   1535       // Define the fixed-point data types that we will use here. All use
   1536       // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
   1537       // They only differ by the number of integral vs. fractional bits,
   1538       // determining the range of values that they can represent.
   1539       //
   1540       // F0 uses 0 integer bits, range [-1, 1].
   1541       // This is the return type of math functions such as tanh, logistic,
   1542       // whose range is in [-1, 1].
   1543       using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
   1544       // F3 uses 3 integer bits, range [-8, 8].
   1545       // This is the range of the previous fully-connected node's output,
   1546       // which is our input here.
   1547       using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
   1548       // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
   1549       // 2^StateIntegerBits]. It's used to represent the internal state, whose
   1550       // number of integer bits is currently dictated by the model. See comment
   1551       // on the StateIntegerBits template parameter above.
   1552       using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
   1553       // Implementation of input gate, using fixed-point logistic function.
   1554       F3 input_gate_input = F3::FromRaw(
   1555           activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
   1556       F0 input_gate_output = gemmlowp::logistic(input_gate_input);
   1557       // Implementation of input modulation gate, using fixed-point tanh
   1558       // function.
   1559       F3 input_modulation_gate_input = F3::FromRaw(
   1560           activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
   1561       F0 input_modulation_gate_output =
   1562           gemmlowp::tanh(input_modulation_gate_input);
   1563       // Implementation of forget gate, using fixed-point logistic function.
   1564       F3 forget_gate_input = F3::FromRaw(
   1565           activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
   1566       F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
   1567       // Implementation of output gate, using fixed-point logistic function.
   1568       F3 output_gate_input = F3::FromRaw(
   1569           activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
   1570       F0 output_gate_output = gemmlowp::logistic(output_gate_input);
   1571       // Implementation of internal multiplication nodes, still in fixed-point.
   1572       F0 input_times_input_modulation =
   1573           input_gate_output * input_modulation_gate_output;
   1574       FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
   1575       FS prev_state_times_forget_state = forget_gate_output * prev_state;
   1576       // Implementation of internal addition node, saturating.
   1577       FS new_state = gemmlowp::SaturatingAdd(
   1578           gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
   1579           prev_state_times_forget_state);
   1580       // Implementation of last internal Tanh node, still in fixed-point.
   1581       // Since a Tanh fixed-point implementation is specialized for a given
    1582       // number of integer bits, and each specialization can have a substantial
   1583       // code size, and we already used above a Tanh on an input with 3 integer
   1584       // bits, and per the table in the above function comment there is no
   1585       // significant accuracy to be lost by clamping to [-8, +8] for a
   1586       // 3-integer-bits representation, let us just do that. This helps people
   1587       // porting this to targets where code footprint must be minimized.
   1588       F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
   1589       F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
   1590       // Store the new internal state back to memory, as 16-bit integers.
   1591       // Note: here we store the original value with StateIntegerBits, not
   1592       // the rescaled 3-integer-bits value fed to tanh.
   1593       output_state_data_int16[b * output_depth + c] = new_state.raw();
   1594       // Down-scale the output activations to 8-bit integers, saturating,
   1595       // and store back to memory.
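               // (output_activ_int16 is an F0 value carrying 15 fractional bits, so
               // the rounding divide by 2^8 below leaves 7 fractional bits, mapping
               // [-1, 1] onto [-128, 128]; clamping to [-128, 127] and adding 128
               // then yields the uint8 encoding of [-1, 127/128] described in the
               // function comment.)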
   1596       int16 rescaled_output_activ =
   1597           gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
   1598       int16 clamped_output_activ =
   1599           std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
   1600       output_activ_data_uint8[b * output_depth + c] =
   1601           128 + clamped_output_activ;
   1602     }
   1603   }
   1604 }
   1605 
   1606 template <typename Scalar>
   1607 void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
   1608                      int axis, int outputs_count, Scalar* const* output_data,
   1609                      const Dims<4>* const* output_dims) {
   1610   const int batches = ArraySize(*output_dims[0], 3);
   1611   const int height = ArraySize(*output_dims[0], 2);
   1612   const int width = ArraySize(*output_dims[0], 1);
   1613   const int depth = ArraySize(*output_dims[0], 0);
   1614 
   1615   const int slice_size = ArraySize(*output_dims[0], axis);
   1616 
   1617   for (int i = 0; i < outputs_count; ++i) {
   1618     int offset = i * slice_size * input_dims.strides[axis];
   1619     for (int b = 0; b < batches; ++b) {
   1620       for (int y = 0; y < height; ++y) {
   1621         for (int x = 0; x < width; ++x) {
   1622           for (int c = 0; c < depth; ++c) {
   1623             auto out = Offset(*output_dims[i], c, x, y, b);
   1624             auto in = Offset(input_dims, c, x, y, b);
   1625             output_data[i][out] = input_data[offset + in];
   1626           }
   1627         }
   1628       }
   1629     }
   1630   }
   1631 }
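
         // For example, splitting a depth-8 input into outputs_count == 2 outputs
         // along axis 0 yields two depth-4 outputs; output i then reads the input
         // at a flat offset of i * 4 (slice_size * strides[0], with strides[0]
         // equal to 1 for a dense tensor).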
   1632 
   1633 template <FusedActivationFunctionType Ac, typename Scalar>
   1634 void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
   1635                      int outputs_count, Scalar* const* output_data,
   1636                      const Dims<4>* const* output_dims) {
   1637   TFLITE_DCHECK_GE(outputs_count, 1);
   1638   for (int i = 0; i < outputs_count; i++) {
   1639     /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
   1640     /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
   1641     /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
   1642   }
    1643   // For now we don't have a model with a TensorFlowSplit
    1644   // with a fused activation function.
   1645   TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
   1646 
   1647   TensorFlowSplit(input_data, input_dims, /*axis=*/0, outputs_count,
   1648                   output_data, output_dims);
   1649 }
   1650 
   1651 // TODO(benoitjacob) make this a proper reference impl without Eigen!
   1652 template <typename Scalar>
   1653 using MatrixMap = typename std::conditional<
   1654     std::is_const<Scalar>::value,
   1655     Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
   1656                                    Eigen::Dynamic, Eigen::Dynamic>>,
   1657     Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
   1658 
   1659 template <typename Scalar, int N>
   1660 MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data,
   1661                                                 const Dims<N>& dims) {
   1662   const int rows = dims.sizes[0];
   1663   int cols = 1;
   1664   for (int d = 1; d < N; d++) {
   1665     cols *= dims.sizes[d];
   1666   }
   1667   return MatrixMap<Scalar>(data, rows, cols);
   1668 }
   1669 
   1670 template <typename Scalar, int N>
   1671 MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data,
   1672                                                const Dims<N>& dims) {
   1673   const int cols = dims.sizes[N - 1];
   1674   int rows = 1;
   1675   for (int d = 0; d < N - 1; d++) {
   1676     rows *= dims.sizes[d];
   1677   }
   1678   return MatrixMap<Scalar>(data, rows, cols);
   1679 }
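
         // For example, a dense Dims<4> array with sizes {d0, d1, d2, d3} maps to
         // a (d0 x d1*d2*d3) matrix with MapAsMatrixWithFirstDimAsRows and to a
         // (d0*d1*d2 x d3) matrix with MapAsMatrixWithLastDimAsCols; both rely on
         // Eigen's default column-major storage matching the
         // first-dimension-contiguous layout.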
   1680 
   1681 inline int NodeOffset(int b, int h, int w, int height, int width) {
   1682   return (b * height + h) * width + w;
   1683 }
   1684 
   1685 inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
   1686                         int stride_width, int stride_height, int pad_width,
   1687                         int pad_height, int filter_width, int filter_height,
   1688                         float output_activation_min,
   1689                         float output_activation_max, float* output_data,
   1690                         const Dims<4>& output_dims) {
   1691   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   1692   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   1693   const int input_height = ArraySize(input_dims, 2);
   1694   const int input_width = ArraySize(input_dims, 1);
   1695   const int output_height = ArraySize(output_dims, 2);
   1696   const int output_width = ArraySize(output_dims, 1);
   1697   for (int batch = 0; batch < batches; ++batch) {
   1698     for (int out_y = 0; out_y < output_height; ++out_y) {
   1699       for (int out_x = 0; out_x < output_width; ++out_x) {
   1700         for (int channel = 0; channel < depth; ++channel) {
   1701           const int in_x_origin = (out_x * stride_width) - pad_width;
   1702           const int in_y_origin = (out_y * stride_height) - pad_height;
   1703           // Compute the boundaries of the filter region clamped so as to
   1704           // ensure that the filter window fits in the input array.
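                   // E.g. with pad_width == 1 and out_x == 0, in_x_origin is -1, so
                   // filter_x_start becomes 1 and the out-of-bounds column x == -1
                   // is skipped; the average below then divides by the number of
                   // elements actually visited (filter_count), not by
                   // filter_width * filter_height.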
   1705           const int filter_x_start = std::max(0, -in_x_origin);
   1706           const int filter_x_end =
   1707               std::min(filter_width, input_width - in_x_origin);
   1708           const int filter_y_start = std::max(0, -in_y_origin);
   1709           const int filter_y_end =
   1710               std::min(filter_height, input_height - in_y_origin);
   1711           float total = 0.f;
   1712           float filter_count = 0;
   1713           for (int filter_y = filter_y_start; filter_y < filter_y_end;
   1714                ++filter_y) {
   1715             for (int filter_x = filter_x_start; filter_x < filter_x_end;
   1716                  ++filter_x) {
   1717               const int in_x = in_x_origin + filter_x;
   1718               const int in_y = in_y_origin + filter_y;
   1719               total +=
   1720                   input_data[Offset(input_dims, channel, in_x, in_y, batch)];
   1721               filter_count++;
   1722             }
   1723           }
   1724           const float average = total / filter_count;
   1725           output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
   1726               ActivationFunctionWithMinMax(average, output_activation_min,
   1727                                            output_activation_max);
   1728         }
   1729       }
   1730     }
   1731   }
   1732 }
   1733 
   1734 // legacy, for compatibility with old checked-in code
   1735 template <FusedActivationFunctionType Ac>
   1736 void AveragePool(const float* input_data, const Dims<4>& input_dims,
   1737                  int stride_width, int stride_height, int pad_width,
   1738                  int pad_height, int filter_width, int filter_height,
   1739                  float* output_data, const Dims<4>& output_dims) {
   1740   float output_activation_min, output_activation_max;
   1741   GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
   1742   AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
   1743               pad_height, filter_width, filter_height, output_activation_min,
   1744               output_activation_max, output_data, output_dims);
   1745 }
   1746 
   1747 // legacy, for compatibility with old checked-in code
   1748 template <FusedActivationFunctionType Ac>
   1749 void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride,
   1750                  int pad_width, int pad_height, int filter_width,
   1751                  int filter_height, float* output_data,
   1752                  const Dims<4>& output_dims) {
   1753   AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
   1754                   filter_width, filter_height, output_data, output_dims);
   1755 }
   1756 
   1757 inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
   1758                         int stride_width, int stride_height, int pad_width,
   1759                         int pad_height, int filter_width, int filter_height,
   1760                         int32 output_activation_min,
   1761                         int32 output_activation_max, uint8* output_data,
   1762                         const Dims<4>& output_dims) {
   1763   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
   1764   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   1765   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   1766   const int input_height = ArraySize(input_dims, 2);
   1767   const int input_width = ArraySize(input_dims, 1);
   1768   const int output_height = ArraySize(output_dims, 2);
   1769   const int output_width = ArraySize(output_dims, 1);
   1770   for (int batch = 0; batch < batches; ++batch) {
   1771     for (int out_y = 0; out_y < output_height; ++out_y) {
   1772       for (int out_x = 0; out_x < output_width; ++out_x) {
   1773         for (int channel = 0; channel < depth; ++channel) {
   1774           const int in_x_origin = (out_x * stride_width) - pad_width;
   1775           const int in_y_origin = (out_y * stride_height) - pad_height;
   1776           // Compute the boundaries of the filter region clamped so as to
   1777           // ensure that the filter window fits in the input array.
   1778           const int filter_x_start = std::max(0, -in_x_origin);
   1779           const int filter_x_end =
   1780               std::min(filter_width, input_width - in_x_origin);
   1781           const int filter_y_start = std::max(0, -in_y_origin);
   1782           const int filter_y_end =
   1783               std::min(filter_height, input_height - in_y_origin);
   1784           int32 acc = 0;
   1785           int filter_count = 0;
   1786           for (int filter_y = filter_y_start; filter_y < filter_y_end;
   1787                ++filter_y) {
   1788             for (int filter_x = filter_x_start; filter_x < filter_x_end;
   1789                  ++filter_x) {
   1790               const int in_x = in_x_origin + filter_x;
   1791               const int in_y = in_y_origin + filter_y;
   1792               acc += input_data[Offset(input_dims, channel, in_x, in_y, batch)];
   1793               filter_count++;
   1794             }
   1795           }
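                   // Round-to-nearest integer average: adding filter_count / 2
                   // before the integer division rounds instead of truncating.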
   1796           acc = (acc + filter_count / 2) / filter_count;
   1797           acc = std::max(acc, output_activation_min);
   1798           acc = std::min(acc, output_activation_max);
   1799           output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
   1800               static_cast<uint8>(acc);
   1801         }
   1802       }
   1803     }
   1804   }
   1805 }
   1806 
   1807 // legacy, for compatibility with old checked-in code
   1808 template <FusedActivationFunctionType Ac>
   1809 void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
   1810                  int stride_width, int stride_height, int pad_width,
   1811                  int pad_height, int filter_width, int filter_height,
   1812                  int32 output_activation_min, int32 output_activation_max,
   1813                  uint8* output_data, const Dims<4>& output_dims) {
   1814   static_assert(Ac == FusedActivationFunctionType::kNone ||
   1815                     Ac == FusedActivationFunctionType::kRelu ||
   1816                     Ac == FusedActivationFunctionType::kRelu6 ||
   1817                     Ac == FusedActivationFunctionType::kRelu1,
   1818                 "");
   1819   if (Ac == FusedActivationFunctionType::kNone) {
   1820     TFLITE_DCHECK_EQ(output_activation_min, 0);
   1821     TFLITE_DCHECK_EQ(output_activation_max, 255);
   1822   }
   1823   AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
   1824               pad_height, filter_width, filter_height, output_activation_min,
   1825               output_activation_max, output_data, output_dims);
   1826 }
   1827 
   1828 // legacy, for compatibility with old checked-in code
   1829 template <FusedActivationFunctionType Ac>
   1830 void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride,
   1831                  int pad_width, int pad_height, int filter_width,
   1832                  int filter_height, int32 output_activation_min,
   1833                  int32 output_activation_max, uint8* output_data,
   1834                  const Dims<4>& output_dims) {
   1835   AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
   1836                   filter_width, filter_height, output_activation_min,
   1837                   output_activation_max, output_data, output_dims);
   1838 }
   1839 
   1840 inline void L2Pool(const float* input_data, const Dims<4>& input_dims,
   1841                    int stride_width, int stride_height, int pad_width,
   1842                    int pad_height, int filter_width, int filter_height,
   1843                    float output_activation_min, float output_activation_max,
   1844                    float* output_data, const Dims<4>& output_dims) {
   1845   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   1846   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   1847   const int input_height = ArraySize(input_dims, 2);
   1848   const int input_width = ArraySize(input_dims, 1);
   1849   const int output_height = ArraySize(output_dims, 2);
   1850   const int output_width = ArraySize(output_dims, 1);
   1851   for (int batch = 0; batch < batches; ++batch) {
   1852     for (int out_y = 0; out_y < output_height; ++out_y) {
   1853       for (int out_x = 0; out_x < output_width; ++out_x) {
   1854         for (int channel = 0; channel < depth; ++channel) {
   1855           const int in_x_origin = (out_x * stride_width) - pad_width;
   1856           const int in_y_origin = (out_y * stride_height) - pad_height;
   1857           // Compute the boundaries of the filter region clamped so as to
   1858           // ensure that the filter window fits in the input array.
   1859           const int filter_x_start = std::max(0, -in_x_origin);
   1860           const int filter_x_end =
   1861               std::min(filter_width, input_width - in_x_origin);
   1862           const int filter_y_start = std::max(0, -in_y_origin);
   1863           const int filter_y_end =
   1864               std::min(filter_height, input_height - in_y_origin);
   1865           float sum_squares = 0.f;
   1866           int filter_count = 0;
   1867           for (int filter_y = filter_y_start; filter_y < filter_y_end;
   1868                ++filter_y) {
   1869             for (int filter_x = filter_x_start; filter_x < filter_x_end;
   1870                  ++filter_x) {
   1871               const int in_x = in_x_origin + filter_x;
   1872               const int in_y = in_y_origin + filter_y;
   1873               const float val =
   1874                   input_data[Offset(input_dims, channel, in_x, in_y, batch)];
   1875               sum_squares += val * val;
   1876               filter_count++;
   1877             }
   1878           }
   1879           const float l2pool_result = std::sqrt(sum_squares / filter_count);
   1880           output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
   1881               ActivationFunctionWithMinMax(l2pool_result, output_activation_min,
   1882                                            output_activation_max);
   1883         }
   1884       }
   1885     }
   1886   }
   1887 }
   1888 
   1889 // legacy, for compatibility with old checked-in code
   1890 template <FusedActivationFunctionType Ac>
   1891 void L2Pool(const float* input_data, const Dims<4>& input_dims,
   1892             int stride_width, int stride_height, int pad_width, int pad_height,
   1893             int filter_width, int filter_height, float* output_data,
   1894             const Dims<4>& output_dims) {
   1895   float output_activation_min, output_activation_max;
   1896   GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
   1897 
   1898   L2Pool(input_data, input_dims, stride_width, stride_height, pad_width,
   1899          pad_height, filter_width, filter_height, output_activation_min,
   1900          output_activation_max, output_data, output_dims);
   1901 }
   1902 
   1903 // legacy, for compatibility with old checked-in code
   1904 template <FusedActivationFunctionType Ac>
   1905 void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride,
   1906             int pad_width, int pad_height, int filter_width, int filter_height,
   1907             float* output_data, const Dims<4>& output_dims) {
   1908   L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
   1909              filter_width, filter_height, output_data, output_dims);
   1910 }
   1911 
   1912 inline void MaxPool(const float* input_data, const Dims<4>& input_dims,
   1913                     int stride_width, int stride_height, int pad_width,
   1914                     int pad_height, int filter_width, int filter_height,
   1915                     float output_activation_min, float output_activation_max,
   1916                     float* output_data, const Dims<4>& output_dims) {
   1917   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   1918   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   1919   const int input_height = ArraySize(input_dims, 2);
   1920   const int input_width = ArraySize(input_dims, 1);
   1921   const int output_height = ArraySize(output_dims, 2);
   1922   const int output_width = ArraySize(output_dims, 1);
   1923   for (int batch = 0; batch < batches; ++batch) {
   1924     for (int out_y = 0; out_y < output_height; ++out_y) {
   1925       for (int out_x = 0; out_x < output_width; ++out_x) {
   1926         for (int channel = 0; channel < depth; ++channel) {
   1927           const int in_x_origin = (out_x * stride_width) - pad_width;
   1928           const int in_y_origin = (out_y * stride_height) - pad_height;
   1929           // Compute the boundaries of the filter region clamped so as to
   1930           // ensure that the filter window fits in the input array.
   1931           const int filter_x_start = std::max(0, -in_x_origin);
   1932           const int filter_x_end =
   1933               std::min(filter_width, input_width - in_x_origin);
   1934           const int filter_y_start = std::max(0, -in_y_origin);
   1935           const int filter_y_end =
   1936               std::min(filter_height, input_height - in_y_origin);
   1937           float max = std::numeric_limits<float>::lowest();
   1938           for (int filter_y = filter_y_start; filter_y < filter_y_end;
   1939                ++filter_y) {
   1940             for (int filter_x = filter_x_start; filter_x < filter_x_end;
   1941                  ++filter_x) {
   1942               const int in_x = in_x_origin + filter_x;
   1943               const int in_y = in_y_origin + filter_y;
   1944               max = std::max(
   1945                   max,
   1946                   input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
   1947             }
   1948           }
   1949           output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
   1950               ActivationFunctionWithMinMax(max, output_activation_min,
   1951                                            output_activation_max);
   1952         }
   1953       }
   1954     }
   1955   }
   1956 }
   1957 
   1958 // legacy, for compatibility with old checked-in code
   1959 template <FusedActivationFunctionType Ac>
   1960 void MaxPool(const float* input_data, const Dims<4>& input_dims,
   1961              int stride_width, int stride_height, int pad_width, int pad_height,
   1962              int filter_width, int filter_height, float* output_data,
   1963              const Dims<4>& output_dims) {
   1964   float output_activation_min, output_activation_max;
   1965   GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
   1966   MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
   1967           pad_height, filter_width, filter_height, output_activation_min,
   1968           output_activation_max, output_data, output_dims);
   1969 }
   1970 
   1971 // legacy, for compatibility with old checked-in code
   1972 template <FusedActivationFunctionType Ac>
   1973 void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride,
   1974              int pad_width, int pad_height, int filter_width, int filter_height,
   1975              float* output_data, const Dims<4>& output_dims) {
   1976   MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
   1977               filter_width, filter_height, output_data, output_dims);
   1978 }
   1979 
   1980 inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
   1981                     int stride_width, int stride_height, int pad_width,
   1982                     int pad_height, int filter_width, int filter_height,
   1983                     int32 output_activation_min, int32 output_activation_max,
   1984                     uint8* output_data, const Dims<4>& output_dims) {
   1985   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
   1986   TFLITE_DCHECK_GE(output_activation_min, 0);
   1987   TFLITE_DCHECK_LE(output_activation_max, 255);
   1988   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   1989   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   1990   const int input_height = ArraySize(input_dims, 2);
   1991   const int input_width = ArraySize(input_dims, 1);
   1992   const int output_height = ArraySize(output_dims, 2);
   1993   const int output_width = ArraySize(output_dims, 1);
   1994   for (int batch = 0; batch < batches; ++batch) {
   1995     for (int out_y = 0; out_y < output_height; ++out_y) {
   1996       for (int out_x = 0; out_x < output_width; ++out_x) {
   1997         for (int channel = 0; channel < depth; ++channel) {
   1998           const int in_x_origin = (out_x * stride_width) - pad_width;
   1999           const int in_y_origin = (out_y * stride_height) - pad_height;
   2000           // Compute the boundaries of the filter region clamped so as to
   2001           // ensure that the filter window fits in the input array.
   2002           const int filter_x_start = std::max(0, -in_x_origin);
   2003           const int filter_x_end =
   2004               std::min(filter_width, input_width - in_x_origin);
   2005           const int filter_y_start = std::max(0, -in_y_origin);
   2006           const int filter_y_end =
   2007               std::min(filter_height, input_height - in_y_origin);
   2008           uint8 max = 0;
   2009           for (int filter_y = filter_y_start; filter_y < filter_y_end;
   2010                ++filter_y) {
   2011             for (int filter_x = filter_x_start; filter_x < filter_x_end;
   2012                  ++filter_x) {
   2013               const int in_x = in_x_origin + filter_x;
   2014               const int in_y = in_y_origin + filter_y;
   2015               max = std::max(
   2016                   max,
   2017                   input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
   2018             }
   2019           }
   2020           max = std::max<uint8>(max, output_activation_min);
   2021           max = std::min<uint8>(max, output_activation_max);
   2022           output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
   2023               static_cast<uint8>(max);
   2024         }
   2025       }
   2026     }
   2027   }
   2028 }
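
        // Worked example of the window clamping above (illustrative sizes, not
        // part of the original code): with input_width = 5, filter_width = 3,
        // stride_width = 2 and pad_width = 1, output column out_x = 0 has
        // in_x_origin = -1, so filter_x_start = 1 and
        // filter_x_end = min(3, 5 - (-1)) = 3; only in_x = 0 and in_x = 1 are
        // read, and the padded column is skipped rather than treated as zero.
        // For out_x = 2, in_x_origin = 3 and filter_x_end = min(3, 5 - 3) = 2,
        // so the window is clamped at the right edge to in_x = 3 and in_x = 4.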
   2029 
   2030 // legacy, for compatibility with old checked-in code
   2031 template <FusedActivationFunctionType Ac>
   2032 void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
   2033              int stride_width, int stride_height, int pad_width, int pad_height,
   2034              int filter_width, int filter_height, int32 output_activation_min,
   2035              int32 output_activation_max, uint8* output_data,
   2036              const Dims<4>& output_dims) {
   2037   static_assert(Ac == FusedActivationFunctionType::kNone ||
   2038                     Ac == FusedActivationFunctionType::kRelu ||
   2039                     Ac == FusedActivationFunctionType::kRelu6 ||
   2040                     Ac == FusedActivationFunctionType::kRelu1,
   2041                 "");
   2042   if (Ac == FusedActivationFunctionType::kNone) {
   2043     TFLITE_DCHECK_EQ(output_activation_min, 0);
   2044     TFLITE_DCHECK_EQ(output_activation_max, 255);
   2045   }
   2046   MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
   2047           pad_height, filter_width, filter_height, output_activation_min,
   2048           output_activation_max, output_data, output_dims);
   2049 }
   2050 
   2051 // legacy, for compatibility with old checked-in code
   2052 template <FusedActivationFunctionType Ac>
   2053 void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride,
   2054              int pad_width, int pad_height, int filter_width, int filter_height,
   2055              int32 output_activation_min, int32 output_activation_max,
   2056              uint8* output_data, const Dims<4>& output_dims) {
   2057   MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
   2058               filter_width, filter_height, output_activation_min,
   2059               output_activation_max, output_data, output_dims);
   2060 }
   2061 
   2062 inline void LocalResponseNormalization(const float* input_data,
   2063                                        const Dims<4>& input_dims, int range,
   2064                                        float bias, float alpha, float beta,
   2065                                        float* output_data,
   2066                                        const Dims<4>& output_dims) {
   2067   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2068   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   2069   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   2070   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   2071 
   2072   for (int b = 0; b < batches; ++b) {
   2073     for (int y = 0; y < height; ++y) {
   2074       for (int x = 0; x < width; ++x) {
   2075         for (int c = 0; c < depth; ++c) {
   2076           const int begin_input_c = std::max(0, c - range);
   2077           const int end_input_c = std::min(depth, c + range);
   2078           float accum = 0.f;
   2079           for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
   2080             const float input_val =
   2081                 input_data[Offset(input_dims, input_c, x, y, b)];
   2082             accum += input_val * input_val;
   2083           }
   2084           const float multiplier = std::pow(bias + alpha * accum, -beta);
   2085           output_data[Offset(output_dims, c, x, y, b)] =
   2086               input_data[Offset(input_dims, c, x, y, b)] * multiplier;
   2087         }
   2088       }
   2089     }
   2090   }
   2091 }
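
        // In formula form, the loop above computes, per element,
        //   output[c] = input[c] * (bias + alpha * sum(input[c']^2))^(-beta),
        // where c' ranges over the clamped channel window
        // [max(0, c - range), min(depth, c + range)).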
   2092 
   2093 inline void Softmax(const float* input_data, const Dims<4>& input_dims,
   2094                     float beta, float* output_data,
   2095                     const Dims<4>& output_dims) {
   2096   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2097   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   2098   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   2099   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   2100 
   2101   for (int b = 0; b < batches; ++b) {
   2102     for (int y = 0; y < height; ++y) {
   2103       for (int x = 0; x < width; ++x) {
   2104         // Find the max element value, which we'll use to ensure numerical
   2105         // stability by taking advantage of the following equality:
   2106         // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
   2107         float max = std::numeric_limits<float>::lowest();
   2108         for (int c = 0; c < depth; ++c) {
   2109           max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
   2110         }
   2111 
   2112         // Compute sum.
   2113         float sum = 0.f;
   2114         for (int c = 0; c < depth; ++c) {
   2115           sum += std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
   2116                           beta);
   2117         }
   2118 
   2119         // Compute result.
   2120         for (int c = 0; c < depth; ++c) {
   2121           output_data[Offset(output_dims, c, x, y, b)] =
   2122               std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
   2123                        beta) /
   2124               sum;
   2125         }
   2126       }
   2127     }
   2128   }
   2129 }
   2130 
   2131 inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   2132                     int32 input_beta_multiplier, int32 input_beta_left_shift,
   2133                     int diff_min, uint8* output_data,
   2134                     const Dims<4>& output_dims) {
   2135   // The representation chosen for the input to the exp() function is Q5.26.
   2136   // We need to leave extra space since values that we skip might be as large as
   2137   // -32 before multiplying by input_beta_multiplier, and therefore as large as
   2138   // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
   2139   // accumulation, but exp(-16) definitely is.
   2140   static const int kScaledDiffIntegerBits = 5;
   2141   static const int kAccumulationIntegerBits = 12;
   2142   using FixedPointScaledDiff =
   2143       gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
   2144   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
   2145   using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
   2146 
   2147   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2148   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   2149   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   2150   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   2151 
   2152   for (int b = 0; b < batches; ++b) {
   2153     for (int x = 0; x < width; ++x) {
   2154       for (int y = 0; y < height; ++y) {
   2155         uint8 max_in_row = 0;
   2156         for (int c = 0; c < depth; ++c) {
   2157           max_in_row =
   2158               std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
   2159         }
   2160 
   2161         FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
   2162         for (int c = 0; c < depth; ++c) {
   2163           int32 input_diff =
   2164               static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
   2165               max_in_row;
   2166           if (input_diff >= diff_min) {
   2167             const int32 input_diff_rescaled =
   2168                 MultiplyByQuantizedMultiplierGreaterThanOne(
   2169                     input_diff, input_beta_multiplier, input_beta_left_shift);
   2170             const FixedPointScaledDiff scaled_diff_f8 =
   2171                 FixedPointScaledDiff::FromRaw(input_diff_rescaled);
   2172             sum_of_exps =
   2173                 sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
   2174                                   exp_on_negative_values(scaled_diff_f8));
   2175           }
   2176         }
   2177 
   2178         int32 fixed_sum_of_exps = sum_of_exps.raw();
   2179         int headroom_plus_one =
   2180             CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
   2181         // This is the number of bits to the left of the binary point above 1.0.
   2182         // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
   2183         // no later adjustment will be needed.
   2184         int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
   2185         int32 shifted_sum_minus_one = static_cast<int32>(
   2186             (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
   2187             (static_cast<uint32>(1) << 31));
   2188 
   2189         FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
   2190             FixedPoint0::FromRaw(shifted_sum_minus_one));
   2191 
   2192         for (int c = 0; c < depth; ++c) {
   2193           int32 input_diff =
   2194               static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
   2195               max_in_row;
   2196           if (input_diff >= diff_min) {
   2197             const int32 input_diff_rescaled =
   2198                 MultiplyByQuantizedMultiplierGreaterThanOne(
   2199                     input_diff, input_beta_multiplier, input_beta_left_shift);
   2200             const FixedPointScaledDiff scaled_diff_f8 =
   2201                 FixedPointScaledDiff::FromRaw(input_diff_rescaled);
   2202 
   2203             FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
   2204             int32 unsat_output = gemmlowp::RoundingDivideByPOT(
   2205                 (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
   2206 
   2207             output_data[Offset(output_dims, c, x, y, b)] = static_cast<uint8>(
   2208                 std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
   2209 
   2210           } else {
   2211             output_data[Offset(output_dims, c, x, y, b)] = 0;
   2212           }
   2213         }
   2214       }
   2215     }
   2216   }
   2217 }
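
        // Worked example of the reciprocal step above, extending the
        // fixed_sum_of_exps = 1.25 case from the comments (illustrative
        // numbers): in Q12.19 the raw sum is 1.25 * 2^19 = 655360, whose
        // uint32 representation has 12 leading zeros, so headroom_plus_one = 12
        // and num_bits_over_unit = 0. Shifting left by 12 gives 0xA0000000;
        // subtracting 2^31 leaves 0x20000000, which is 0.25 in Q0.31, and
        // one_over_one_plus_x_for_x_in_0_1 returns 1 / 1.25 = 0.8 as
        // shifted_scale. A channel whose exp term is 0.5 then yields
        // RoundingDivideByPOT(0.4 * 2^31, 0 + 31 - 8) = 102, which matches
        // round(256 * 0.5 / 1.25).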
   2218 
   2219 inline void Logistic(const float* input_data, const Dims<4>& input_dims,
   2220                      float* output_data, const Dims<4>& output_dims) {
   2221   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2222   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   2223   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   2224   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   2225   for (int b = 0; b < batches; ++b) {
   2226     for (int y = 0; y < height; ++y) {
   2227       for (int x = 0; x < width; ++x) {
   2228         for (int c = 0; c < depth; ++c) {
   2229           float val = input_data[Offset(input_dims, c, x, y, b)];
   2230           float result = 1.f / (1.f + std::exp(-val));
   2231           output_data[Offset(output_dims, c, x, y, b)] = result;
   2232         }
   2233       }
   2234     }
   2235   }
   2236 }
   2237 
   2238 inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
   2239                      int32 input_zero_point, int32 input_range_radius,
   2240                      int32 input_multiplier, int input_left_shift,
   2241                      uint8* output_data, const Dims<4>& output_dims) {
   2242   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2243   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   2244   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   2245   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   2246   for (int b = 0; b < batches; ++b) {
   2247     for (int y = 0; y < height; ++y) {
   2248       for (int x = 0; x < width; ++x) {
   2249         for (int c = 0; c < depth; ++c) {
   2250           const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
   2251           const int32 input_val_centered =
   2252               static_cast<int32>(input_val_u8) - input_zero_point;
   2253           uint8 output_val;
   2254           if (input_val_centered <= -input_range_radius) {
   2255             output_val = 0;
   2256           } else if (input_val_centered >= input_range_radius) {
   2257             output_val = 255;
   2258           } else {
   2259             const int32 input_val_rescaled =
   2260                 MultiplyByQuantizedMultiplierGreaterThanOne(
   2261                     input_val_centered, input_multiplier, input_left_shift);
   2262             using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
   2263             using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
   2264             const FixedPoint4 input_val_f4 =
   2265                 FixedPoint4::FromRaw(input_val_rescaled);
   2266             const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
   2267             using gemmlowp::RoundingDivideByPOT;
   2268             int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
   2269             if (output_val_s32 == 256) {
   2270               output_val_s32 = 255;
   2271             }
   2272             TFLITE_DCHECK_GE(output_val_s32, 0);
   2273             TFLITE_DCHECK_LE(output_val_s32, 255);
   2274             output_val = static_cast<uint8>(output_val_s32);
   2275           }
   2276           output_data[Offset(output_dims, c, x, y, b)] = output_val;
   2277         }
   2278       }
   2279     }
   2280   }
   2281 }
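
        // Worked example (illustrative): an input exactly at input_zero_point
        // has input_val_centered = 0, which rescales to 0.0 in Q4.27;
        // gemmlowp::logistic(0) = 0.5, i.e. raw value 2^30 in Q0.31, and
        // RoundingDivideByPOT(2^30, 23) = 128. The output is therefore on a
        // 1/256 scale with zero point 0, and a raw result that rounds up to
        // 256 (inputs near the top of the range) is clamped back to 255.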
   2282 
   2283 inline void Tanh(const float* input_data, const Dims<4>& input_dims,
   2284                  float* output_data, const Dims<4>& output_dims) {
   2285   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2286   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   2287   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   2288   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   2289   for (int b = 0; b < batches; ++b) {
   2290     for (int y = 0; y < height; ++y) {
   2291       for (int x = 0; x < width; ++x) {
   2292         for (int c = 0; c < depth; ++c) {
   2293           float val = input_data[Offset(input_dims, c, x, y, b)];
   2294           float result = std::tanh(val);
   2295           output_data[Offset(output_dims, c, x, y, b)] = result;
   2296         }
   2297       }
   2298     }
   2299   }
   2300 }
   2301 
   2302 inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
   2303                  int32 input_zero_point, int32 input_range_radius,
   2304                  int32 input_multiplier, int input_left_shift,
   2305                  uint8* output_data, const Dims<4>& output_dims) {
   2306   const int32 output_zero_point = 128;
   2307   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2308   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   2309   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   2310   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   2311   for (int b = 0; b < batches; ++b) {
   2312     for (int y = 0; y < height; ++y) {
   2313       for (int x = 0; x < width; ++x) {
   2314         for (int c = 0; c < depth; ++c) {
   2315           const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
   2316           const int32 input_val_centered =
   2317               static_cast<int32>(input_val_u8) - input_zero_point;
   2318           uint8 output_val;
   2319           if (input_val_centered <= -input_range_radius) {
   2320             output_val = 0;
   2321           } else if (input_val_centered >= input_range_radius) {
   2322             output_val = 255;
   2323           } else {
   2324             const int32 input_val_rescaled =
   2325                 MultiplyByQuantizedMultiplierGreaterThanOne(
   2326                     input_val_centered, input_multiplier, input_left_shift);
   2327             using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
   2328             using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
   2329             const FixedPoint4 input_val_f4 =
   2330                 FixedPoint4::FromRaw(input_val_rescaled);
   2331             const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
   2332 
   2333             using gemmlowp::RoundingDivideByPOT;
   2334             int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
   2335             output_val_s32 += output_zero_point;
   2336             if (output_val_s32 == 256) {
   2337               output_val_s32 = 255;
   2338             }
   2339             TFLITE_DCHECK_GE(output_val_s32, 0);
   2340             TFLITE_DCHECK_LE(output_val_s32, 255);
   2341             output_val = static_cast<uint8>(output_val_s32);
   2342           }
   2343           output_data[Offset(output_dims, c, x, y, b)] = output_val;
   2344         }
   2345       }
   2346     }
   2347   }
   2348 }
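
        // Worked example (illustrative): a centered input of 0 gives
        // gemmlowp::tanh(0) = 0 in Q0.31, so output_val_s32 = 0 + 128 = 128,
        // the output zero point. Near positive saturation the raw value
        // approaches 2^31 - 1; RoundingDivideByPOT(2^31 - 1, 24) = 128, and
        // 128 + 128 = 256 is clamped to 255. The output is thus on a 1/128
        // scale centered at 128.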
   2349 
   2350 inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
   2351                        int32 zero_point, double scale, float* output_data,
   2352                        const Dims<4>& output_dims) {
   2353   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2354   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   2355   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   2356   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   2357   for (int b = 0; b < batches; ++b) {
   2358     for (int y = 0; y < height; ++y) {
   2359       for (int x = 0; x < width; ++x) {
   2360         for (int c = 0; c < depth; ++c) {
   2361           int32 val = input_data[Offset(input_dims, c, x, y, b)];
   2362           float result = static_cast<float>(scale * (val - zero_point));
   2363           output_data[Offset(output_dims, c, x, y, b)] = result;
   2364         }
   2365       }
   2366     }
   2367   }
   2368 }
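
        // The affine mapping above is result = scale * (q - zero_point). For
        // example (illustrative values), with scale = 0.5 and zero_point = 128,
        // the quantized values 0, 128 and 130 dequantize to -64.0, 0.0 and 1.0.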
   2369 
   2370 inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
   2371                       float rmin, float rmax, float* output_data,
   2372                       const Dims<4>& output_dims) {
   2373   // 0 should always be a representable value. Let's assume that the initial
   2374   // min,max range contains 0.
   2375   TFLITE_DCHECK_LE(rmin, 0.);
   2376   TFLITE_DCHECK_GE(rmax, 0.);
   2377 
   2378   // Determine quantization parameters: zero_point, scale.
   2379   using Integer = uint8;
   2380   const Integer qmin = std::numeric_limits<Integer>::min();
   2381   const Integer qmax = std::numeric_limits<Integer>::max();
   2382   const float qmin_float = qmin;
   2383   const float qmax_float = qmax;
   2384   int32 zero_point = 0;
   2385   float scale = 0.f;
   2386   // If rmin==rmax, both must be zero per the above assertion,
   2387   // so we are done.
   2388   if (rmin != rmax) {
   2389     // First determine the scale.
   2390     scale = (rmax - rmin) / (qmax_float - qmin_float);
   2391 
   2392     // Zero-point computation.
   2393     // First the initial floating-point computation. The zero-point can be
   2394     // determined from solving an affine equation for any known pair
   2395     // (real value, corresponding quantized value).
   2396     // We know two such pairs: (rmin, qmin) and (rmax, qmax).
   2397     // The arithmetic error on the zero point computed from either pair
   2398     // will be roughly machine_epsilon * (sum of absolute values of terms)
   2399     // so we want to use the variant that adds the smaller terms.
   2400     const float zero_point_from_min = qmin_float - rmin / scale;
   2401     const float zero_point_from_max = qmax_float - rmax / scale;
   2402     const float zero_point_from_min_error =
   2403         std::abs(qmin_float) + std::abs(rmin / scale);
   2404     const float zero_point_from_max_error =
   2405         std::abs(qmax_float) + std::abs(rmax / scale);
   2406 
   2407     const float zero_point_float =
   2408         zero_point_from_min_error < zero_point_from_max_error
   2409             ? zero_point_from_min
   2410             : zero_point_from_max;
   2411 
   2412     // Now we need to nudge the zero point to be an integer
   2413     // (our zero points are integer, and this is motivated by the requirement
   2414     // to be able to represent the real value "0" exactly as a quantized value,
   2415     // which is required in multiple places, for example in Im2col with SAME
   2416     // padding).
   2417     if (zero_point_float < qmin_float) {
   2418       zero_point = qmin;
   2419     } else if (zero_point_float > qmax_float) {
   2420       zero_point = qmax;
   2421     } else {
   2422       zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
   2423     }
   2424     // The zero point should always be in the range of quantized value,
   2425     // [qmin, qmax].
   2426     TFLITE_DCHECK_GE(zero_point, qmin);
   2427     TFLITE_DCHECK_LE(zero_point, qmax);
   2428   }
   2429 
   2430   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2431   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   2432   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   2433   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   2434   for (int b = 0; b < batches; ++b) {
   2435     for (int y = 0; y < height; ++y) {
   2436       for (int x = 0; x < width; ++x) {
   2437         for (int c = 0; c < depth; ++c) {
   2438           const float src_val = input_data[Offset(input_dims, c, x, y, b)];
   2439           const float unclamped_quantized_val =
   2440               TfLiteRound(zero_point + src_val / scale);
   2441           const float quantized_val = std::min(
   2442               qmax_float, std::max(qmin_float, unclamped_quantized_val));
   2443           const float dst_val = scale * (quantized_val - zero_point);
   2444           output_data[Offset(output_dims, c, x, y, b)] = dst_val;
   2445         }
   2446       }
   2447     }
   2448   }
   2449 }
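
        // Worked example of the nudging above (illustrative values): for
        // rmin = -0.95 and rmax = 1.0, scale = 1.95 / 255 ~= 0.0076471, both
        // zero-point candidates come out to ~124.23, and the zero point is
        // nudged to the integer 124. An input of 0.1 then quantizes to
        // round(124 + 0.1 / scale) = 137 and dequantizes back to
        // scale * (137 - 124) ~= 0.0994, the nearest representable value.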
   2450 
   2451 template <typename SrcT, typename DstT>
   2452 inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
   2453                  DstT* output_data, const Dims<4>& output_dims) {
   2454   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2455   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   2456   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   2457   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   2458   for (int b = 0; b < batches; ++b) {
   2459     for (int y = 0; y < height; ++y) {
   2460       for (int x = 0; x < width; ++x) {
   2461         for (int c = 0; c < depth; ++c) {
   2462           int offset = Offset(input_dims, c, x, y, b);
   2463           output_data[offset] = static_cast<DstT>(input_data[offset]);
   2464         }
   2465       }
   2466     }
   2467   }
   2468 }
   2469 
   2470 inline void Floor(const float* input_data, const Dims<4>& input_dims,
   2471                   float* output_data, const Dims<4>& output_dims) {
   2472   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2473   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   2474   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   2475   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   2476   for (int b = 0; b < batches; ++b) {
   2477     for (int y = 0; y < height; ++y) {
   2478       for (int x = 0; x < width; ++x) {
   2479         for (int c = 0; c < depth; ++c) {
   2480           int offset = Offset(input_dims, c, x, y, b);
   2481           output_data[offset] = std::floor(input_data[offset]);
   2482         }
   2483       }
   2484     }
   2485   }
   2486 }
   2487 
   2488 template <typename T>
   2489 inline void Gather(const T* input_data, const Dims<4>& input_dims,
   2490                    int input_rank, const int32* coords_data,
   2491                    const Dims<4>& coords_dims, T* output_data,
   2492                    const Dims<4>& output_dims) {
   2493   TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]);
   2494   int stride = input_dims.strides[input_rank - 1];
   2495   T* out = output_data;
   2496 
   2497   for (int i = 0; i < coords_dims.sizes[0]; i++) {
   2498     TFLITE_DCHECK_GE(coords_data[i], 0);
   2499     TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]);
   2500     const T* in = input_data + coords_data[i] * stride;
   2501     memcpy(out, in, sizeof(T) * stride);
   2502     out += stride;
   2503   }
   2504 }
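
        // Example (illustrative): with input_rank = 2, stride is the number of
        // elements in one slice along the gathered axis, and
        // coords_data = {2, 0} copies slice 2 followed by slice 0 into the
        // output, each as a single contiguous memcpy.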
   2505 
   2506 inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
   2507                            const int32* output_size_data,
   2508                            const Dims<4>& output_size_dims, float* output_data,
   2509                            const Dims<4>& output_dims, bool align_corners) {
   2510   int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2511   int32 input_height = ArraySize(input_dims, 2);
   2512   int32 input_width = ArraySize(input_dims, 1);
   2513   int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   2514 
   2515   TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
   2516   TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
   2517   TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
   2518   TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
   2519   int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
   2520   int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
   2521   float height_scale = static_cast<float>(input_height) / output_height;
   2522   float width_scale = static_cast<float>(input_width) / output_width;
   2523   if (align_corners && output_height > 1) {
   2524     height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
   2525   }
   2526   if (align_corners && output_width > 1) {
   2527     width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
   2528   }
   2529 
   2530   for (int b = 0; b < batches; ++b) {
   2531     for (int y = 0; y < output_height; ++y) {
   2532       float input_y = y * height_scale;
   2533       int32 y0 = static_cast<int32>(std::floor(input_y));
   2534       int32 y1 = std::min(y0 + 1, input_height - 1);
   2535       for (int x = 0; x < output_width; ++x) {
   2536         float input_x = x * width_scale;
   2537         int32 x0 = static_cast<int32>(std::floor(input_x));
   2538         int32 x1 = std::min(x0 + 1, input_width - 1);
   2539         for (int c = 0; c < depth; ++c) {
   2540           float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] *
   2541                                     (1 - (input_y - y0)) *
   2542                                     (1 - (input_x - x0)) +
   2543                                 input_data[Offset(input_dims, c, x0, y1, b)] *
   2544                                     (input_y - y0) * (1 - (input_x - x0)) +
   2545                                 input_data[Offset(input_dims, c, x1, y0, b)] *
   2546                                     (1 - (input_y - y0)) * (input_x - x0) +
   2547                                 input_data[Offset(input_dims, c, x1, y1, b)] *
   2548                                     (input_y - y0) * (input_x - x0);
   2549           output_data[Offset(output_dims, c, x, y, b)] = interpolation;
   2550         }
   2551       }
   2552     }
   2553   }
   2554 }
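
        // Example of the scale computation (illustrative sizes): resizing 2x2
        // to 4x4 without align_corners gives height_scale = 2 / 4 = 0.5, so
        // output row 3 samples input_y = 1.5 with y0 = y1 = 1, clamped at the
        // bottom edge; with align_corners, height_scale = (2 - 1) / (4 - 1) =
        // 1/3 and output row 3 samples input_y = 1.0, exactly the last input
        // row.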
   2555 
   2556 // legacy, for compatibility with old checked-in code
   2557 inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
   2558                            const int32* output_size_data,
   2559                            const Dims<4>& output_size_dims, float* output_data,
   2560                            const Dims<4>& output_dims) {
   2561   ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims,
   2562                  output_data, output_dims, /*align_corners=*/false);
   2563 }
   2564 
   2565 template <typename T>
   2566 inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
   2567                            const int32* block_shape_data,
   2568                            const Dims<4>& block_shape_dims,
   2569                            const int32* paddings_data,
   2570                            const Dims<4>& paddings_dims, T* output_data,
   2571                            const Dims<4>& output_dims) {
   2572   const int output_batch_size = ArraySize(output_dims, 3);
   2573   const int output_height = ArraySize(output_dims, 2);
   2574   const int output_width = ArraySize(output_dims, 1);
   2575   const int input_batch_size = ArraySize(input_dims, 3);
   2576   const int input_height = ArraySize(input_dims, 2);
   2577   const int input_width = ArraySize(input_dims, 1);
   2578   const int depth = ArraySize(input_dims, 0);
   2579   const int block_shape_height = block_shape_data[0];
   2580   const int block_shape_width = block_shape_data[1];
   2581   const int padding_top = paddings_data[0];
   2582   const int padding_left = paddings_data[2];
   2583 
   2584   for (int out_b = 0; out_b < output_batch_size; ++out_b) {
   2585     int input_batch = out_b % input_batch_size;
   2586     int shift_w = (out_b / input_batch_size) % block_shape_width;
   2587     int shift_h = (out_b / input_batch_size) / block_shape_width;
   2588     for (int out_h = 0; out_h < output_height; ++out_h) {
   2589       for (int out_w = 0; out_w < output_width; ++out_w) {
   2590         T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
   2591         if (out_h * block_shape_height + shift_h < padding_top ||
   2592             out_h * block_shape_height + shift_h >=
   2593                 padding_top + input_height ||
   2594             out_w * block_shape_width + shift_w < padding_left ||
   2595             out_w * block_shape_width + shift_w >= padding_left + input_width) {
   2596           memset(out, 0, depth * sizeof(T));
   2597         } else {
   2598           const T* in =
   2599               input_data +
   2600               Offset(input_dims, 0,
   2601                      (out_w * block_shape_width + shift_w) - padding_left,
   2602                      (out_h * block_shape_height + shift_h) - padding_top,
   2603                      input_batch);
   2604           memcpy(out, in, depth * sizeof(T));
   2605         }
   2606       }
   2607     }
   2608   }
   2609 }
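
        // Example of the batch decomposition above (illustrative): with
        // block_shape = {2, 2} and input_batch_size = 1, the four output
        // batches out_b = 0..3 correspond to (shift_h, shift_w) = (0, 0),
        // (0, 1), (1, 0), (1, 1), and output pixel (out_h, out_w) of batch
        // out_b reads input pixel (out_h * 2 + shift_h - padding_top,
        // out_w * 2 + shift_w - padding_left), or zeros inside the padding.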
   2610 
   2611 template <typename T>
   2612 inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
   2613                            const int32* block_shape_data,
   2614                            const Dims<4>& block_shape_dims, T* output_data,
   2615                            const Dims<4>& output_dims) {
   2616   const int output_batch_size = ArraySize(output_dims, 3);
   2617   const int input_batch_size = ArraySize(input_dims, 3);
   2618   const int input_height = ArraySize(input_dims, 2);
   2619   const int input_width = ArraySize(input_dims, 1);
   2620   const int depth = ArraySize(input_dims, 0);
   2621   const int block_shape_width = block_shape_data[1];
   2622   const int block_shape_height = block_shape_data[0];
   2623 
   2624   for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
   2625     for (int in_h = 0; in_h < input_height; ++in_h) {
   2626       for (int in_w = 0; in_w < input_width; ++in_w) {
   2627         int out_batch = in_batch % output_batch_size;
   2628         int out_w = in_w * block_shape_width +
   2629                     (in_batch / output_batch_size) % block_shape_width;
   2630         int out_h = in_h * block_shape_height +
   2631                     (in_batch / output_batch_size) / block_shape_width;
   2632         T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
   2633         const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
   2634         memcpy(out, in, depth * sizeof(T));
   2635       }
   2636     }
   2637   }
   2638 }
   2639 
   2640 template <typename T>
   2641 inline void Pad(const T* input_data, const Dims<4>& input_dims,
   2642                 const std::vector<int>& left_paddings,
   2643                 const std::vector<int>& right_paddings, T* output_data,
   2644                 const Dims<4>& output_dims) {
   2645   const int output_batch = ArraySize(output_dims, 3);
   2646   const int output_height = ArraySize(output_dims, 2);
   2647   const int output_width = ArraySize(output_dims, 1);
   2648   const int output_depth = ArraySize(output_dims, 0);
   2649 
   2650   const int left_b_padding = left_paddings[3];
   2651   const int left_h_padding = left_paddings[2];
   2652   const int left_w_padding = left_paddings[1];
   2653   const int left_d_padding = left_paddings[0];
   2654 
   2655   const int right_b_padding = right_paddings[3];
   2656   const int right_h_padding = right_paddings[2];
   2657   const int right_w_padding = right_paddings[1];
   2658   const int right_d_padding = right_paddings[0];
   2659 
   2660   const T* in_ptr = input_data;
   2661   T* out_ptr = output_data;
   2662   for (int out_b = 0; out_b < output_batch; ++out_b) {
   2663     for (int out_h = 0; out_h < output_height; ++out_h) {
   2664       for (int out_w = 0; out_w < output_width; ++out_w) {
   2665         for (int out_d = 0; out_d < output_depth; ++out_d) {
   2666           if (out_b < left_b_padding ||
   2667               out_b >= output_batch - right_b_padding ||
   2668               out_h < left_h_padding ||
   2669               out_h >= output_height - right_h_padding ||
   2670               out_w < left_w_padding ||
   2671               out_w >= output_width - right_w_padding ||
   2672               out_d < left_d_padding ||
   2673               out_d >= output_depth - right_d_padding) {
   2674             *out_ptr++ = 0;
   2675           } else {
   2676             *out_ptr++ = *in_ptr++;
   2677           }
   2678         }
   2679       }
   2680     }
   2681   }
   2682 }
   2683 
   2684 inline bool LoopCondition(int index, int stop, int stride) {
   2685   return stride > 0 ? index < stop : index > stop;
   2686 }
   2687 
   2688 inline int StartIndex(int start, int stride, int dim, bool masked) {
   2689   return masked ? (stride > 0 ? 0 : dim - 1) : start;
   2690 }
   2691 
   2692 inline int StopIndex(int start, int stop, int stride, int dim, bool masked,
   2693                      bool shrink_axis_masked) {
   2694   return shrink_axis_masked ? stride > 0 ? start + 1 : start - 1
   2695                             : masked ? (stride > 0 ? dim : -1) : stop;
   2696 }
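
        // Example of the mask semantics (illustrative): for a dimension of
        // size 10 with stride 1, a set begin_mask bit makes StartIndex return
        // 0 and a set end_mask bit makes StopIndex return 10, selecting the
        // full range; with stride -1 the masked range runs from 9 down to -1
        // exclusive. A set shrink_axis_mask bit instead forces the
        // single-element range [start, start + 1) for a positive stride.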
   2697 
   2698 template <typename T>
   2699 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
   2700                          int begin_mask, int end_mask, int shrink_axis_mask,
   2701                          const std::vector<int>& starts,
   2702                          const std::vector<int>& stops,
   2703                          const std::vector<int>& strides, T* output_data,
   2704                          const Dims<4>& output_dims) {
   2705   TFLITE_DCHECK_EQ(starts.size(), 4);
   2706   TFLITE_DCHECK_EQ(stops.size(), 4);
   2707   TFLITE_DCHECK_EQ(strides.size(), 4);
   2708   const int start_b =
   2709       StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8);
   2710   const int stop_b =
   2711       StopIndex(start_b, stops[3], strides[3], input_dims.sizes[3],
   2712                 end_mask & 8, shrink_axis_mask & 8);
   2713   const int start_h =
   2714       StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4);
   2715   const int stop_h =
   2716       StopIndex(start_h, stops[2], strides[2], input_dims.sizes[2],
   2717                 end_mask & 4, shrink_axis_mask & 4);
   2718   const int start_w =
   2719       StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2);
   2720   const int stop_w =
   2721       StopIndex(start_w, stops[1], strides[1], input_dims.sizes[1],
   2722                 end_mask & 2, shrink_axis_mask & 2);
   2723   const int start_d =
   2724       StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1);
   2725   const int stop_d =
   2726       StopIndex(start_d, stops[0], strides[0], input_dims.sizes[0],
   2727                 end_mask & 1, shrink_axis_mask & 1);
   2728 
   2729   T* out_ptr = output_data;
   2730   for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]);
   2731        in_b += strides[3]) {
   2732     for (int in_h = start_h; LoopCondition(in_h, stop_h, strides[2]);
   2733          in_h += strides[2]) {
   2734       for (int in_w = start_w; LoopCondition(in_w, stop_w, strides[1]);
   2735            in_w += strides[1]) {
   2736         for (int in_d = start_d; LoopCondition(in_d, stop_d, strides[0]);
   2737              in_d += strides[0]) {
   2738           *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
   2739         }
   2740       }
   2741     }
   2742   }
   2743 }
   2744 
   2745 template <typename T>
   2746 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
   2747                          int begin_mask, int end_mask,
   2748                          const std::vector<int>& starts,
   2749                          const std::vector<int>& stops,
   2750                          const std::vector<int>& strides, T* output_data,
   2751                          const Dims<4>& output_dims) {
   2752   StridedSlice(input_data, input_dims, begin_mask, end_mask,
   2753                /*shrink_axis_mask=*/0, starts, stops, strides, output_data,
   2754                output_dims);
   2755 }
   2756 
   2757 template <typename T>
   2758 inline void Slice(const T* input_data, const Dims<4>& input_dims,
   2759                   const std::vector<int>& begin, const std::vector<int>& size,
   2760                   T* output_data, const Dims<4>& output_dims) {
   2761   // TODO(dkalenichenko): This op only supports 4D tensors.
   2762   TFLITE_DCHECK_EQ(begin.size(), 4);
   2763   TFLITE_DCHECK_EQ(size.size(), 4);
   2764   const int start_b = begin[3];
   2765   const int stop_b =
   2766       size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
   2767   const int start_h = begin[2];
   2768   const int stop_h =
   2769       size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
   2770   const int start_w = begin[1];
   2771   const int stop_w =
   2772       size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
   2773   const int start_d = begin[0];
   2774   const int stop_d =
   2775       size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];
   2776 
   2777   T* out_ptr = output_data;
   2778   for (int in_b = start_b; in_b < stop_b; ++in_b) {
   2779     for (int in_h = start_h; in_h < stop_h; ++in_h) {
   2780       for (int in_w = start_w; in_w < stop_w; ++in_w) {
   2781         for (int in_d = start_d; in_d < stop_d; ++in_d) {
   2782           *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
   2783         }
   2784       }
   2785     }
   2786   }
   2787 }
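
        // Example (illustrative): begin = {0, 0, 1, 0} with
        // size = {-1, -1, -1, 1} selects batch 0 only, rows 1 through the end
        // of the height dimension, and all columns and channels; a size entry
        // of -1 means "everything from begin to the end of that dimension".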
   2788 
   2789 template <typename T>
   2790 inline void Exp(const T* input_data, const size_t num_elements,
   2791                 T* output_data) {
   2792   for (size_t idx = 0; idx < num_elements; ++idx) {
   2793     output_data[idx] = std::exp(input_data[idx]);
   2794   }
   2795 }
   2796 
   2797 template <typename T>
   2798 inline void Mean(T* input_data, const int* input_dims, const int input_num_dims,
   2799                  T* output_data, const int* output_dims,
   2800                  const int output_num_dims, const int* axis,
   2801                  const int num_axis_dimensions, bool keep_dims, int* temp_index,
   2802                  int* resolved_axis) {
   2803   // resets output data.
   2804   size_t num_outputs = 1;
   2805   for (int idx = 0; idx < output_num_dims; ++idx) {
   2806     num_outputs *= static_cast<size_t>(output_dims[idx]);
   2807   }
   2808   for (size_t idx = 0; idx < num_outputs; ++idx) {
   2809     output_data[idx] = 0;
   2810   }
   2811   // resets temp index.
   2812   for (int idx = 0; idx < input_num_dims; ++idx) {
   2813     temp_index[idx] = 0;
   2814   }
   2815   // resolves axis.
   2816   int num_resolved_axis = 0;
   2817   for (int idx = 0; idx < num_axis_dimensions; ++idx) {
   2818     int current = axis[idx];
   2819     TFLITE_DCHECK(current < input_num_dims && current + input_num_dims >= 0);
   2820     if (current < 0) {
   2821       current += input_num_dims;
   2822     }
   2823     bool is_dup = false;
   2824     for (int j = 0; j < num_resolved_axis; ++j) {
   2825       if (resolved_axis[j] == current) {
   2826         is_dup = true;
   2827         break;
   2828       }
   2829     }
   2830     if (!is_dup) {
   2831       resolved_axis[num_resolved_axis++] = current;
   2832     }
   2833   }
   2834   // iterates through input_data.
   2835   for (bool has_next = true; has_next;
   2836        has_next = NextIndex(input_num_dims, input_dims, temp_index)) {
   2837     size_t input_offset =
   2838         ReducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr);
   2839     size_t output_offset =
   2840         ReducedOutputOffset(input_num_dims, input_dims, temp_index,
   2841                             num_resolved_axis, resolved_axis);
   2842     output_data[output_offset] += input_data[input_offset];
   2843   }
   2844   // takes average by num of elements added to get mean.
   2845   size_t num_elements_in_axis = 1;
   2846   for (int idx = 0; idx < num_resolved_axis; ++idx) {
   2847     num_elements_in_axis *= static_cast<size_t>(input_dims[resolved_axis[idx]]);
   2848   }
   2849   for (size_t idx = 0; idx < num_outputs; ++idx) {
   2850     output_data[idx] = static_cast<T>(static_cast<float>(output_data[idx]) /
   2851                                       num_elements_in_axis);
   2852   }
   2853 }
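
        // Example of the axis resolution above (illustrative): with
        // input_num_dims = 4 and axis = {-1, 3}, the -1 first wraps to 3 and
        // the explicit 3 is then dropped as a duplicate, leaving
        // resolved_axis = {3} with num_resolved_axis = 1, so the mean is taken
        // over the last dimension only.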
   2854 
   2855 template <typename T>
   2856 inline void Mean(const T* input_data, const Dims<4>& input_dims,
   2857                  const std::vector<int>& reduction_indices, T* output_data,
   2858                  const Dims<4>& output_dims) {
   2859   const int output_batch = ArraySize(output_dims, 3);
   2860   const int output_height = ArraySize(output_dims, 2);
   2861   const int output_width = ArraySize(output_dims, 1);
   2862   const int output_depth = ArraySize(output_dims, 0);
   2863 
   2864   const int input_height = ArraySize(input_dims, 2);
   2865   const int input_width = ArraySize(input_dims, 1);
   2866 
   2867   // The current implementation only supports simultaneous reduction over
   2868   // width and height.
   2869   TFLITE_DCHECK_EQ(reduction_indices.size(), 2);
   2870   TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) ||
   2871                 (reduction_indices[0] == 2 && reduction_indices[1] == 1));
   2872   TFLITE_DCHECK_EQ(output_height, 1);
   2873   TFLITE_DCHECK_EQ(output_width, 1);
   2874 
   2875   for (int out_b = 0; out_b < output_batch; ++out_b) {
   2876     for (int out_d = 0; out_d < output_depth; ++out_d) {
   2877       float value = 0;
   2878       for (int in_h = 0; in_h < input_height; ++in_h) {
   2879         for (int in_w = 0; in_w < input_width; ++in_w) {
   2880           value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)];
   2881         }
   2882       }
   2883       output_data[Offset(output_dims, out_d, 0, 0, out_b)] =
   2884           value / (input_width * input_height);
   2885     }
   2886   }
   2887 }
   2888 
   2889 template <typename T>
   2890 void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
   2891          const Dims<4>& input2_dims, T* output_data,
   2892          const Dims<4>& output_dims) {
   2893   NdArrayDesc<4> desc1;
   2894   NdArrayDesc<4> desc2;
   2895   NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
   2896 
   2897   // In TensorFlow, the dimensions are canonically named (batch_number, row,
   2898   // col, channel), with extents (batches, height, width, depth), with the
   2899   // trailing dimension changing most rapidly (channels has the smallest stride,
   2900   // typically 1 element).
   2901   //
   2902   // In generated C code, we store arrays with the dimensions reversed. The
   2903   // first dimension has the smallest stride.
   2904   //
   2905   // We name our variables by their TensorFlow convention, but generate C code
   2906   // nesting loops such that the innermost loop has the smallest stride for the
   2907   // best cache behavior.
   2908   for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
   2909     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
   2910       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
   2911         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
   2912           output_data[Offset(output_dims, c, x, y, b)] =
   2913               input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
   2914               input2_data[SubscriptToIndex(desc2, c, x, y, b)];
   2915         }
   2916       }
   2917     }
   2918   }
   2919 }
   2920 
   2921 template <typename T>
   2922 void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
   2923                        const T* input2_data, T* output_data,
   2924                        const Dims<4>& output_dims) {
   2925   int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
   2926   int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
   2927   int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
   2928   int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
   2929 
   2930   auto min_value = input2_data[0];
   2931 
   2932   for (int b = 0; b < batches; b++) {
   2933     for (int y = 0; y < input_height; y++) {
   2934       for (int x = 0; x < input_width; x++) {
   2935         for (int c = 0; c < depth; c++) {
   2936           int offset = Offset(input1_dims, c, x, y, b);
   2937           output_data[offset] =
   2938               input1_data[offset] > min_value ? min_value : input1_data[offset];
   2939         }
   2940       }
   2941     }
   2942   }
   2943 }
   2944 
   2945 template <typename T>
   2946 void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
   2947                        const T* input2_data, T* output_data,
   2948                        const Dims<4>& output_dims) {
   2949   int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
   2950   int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
   2951   int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
   2952   int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
   2953 
   2954   auto max_value = input2_data[0];
   2955 
   2956   for (int b = 0; b < batches; b++) {
   2957     for (int y = 0; y < input_height; y++) {
   2958       for (int x = 0; x < input_width; x++) {
   2959         for (int c = 0; c < depth; c++) {
   2960           int offset = Offset(input1_dims, c, x, y, b);
   2961           output_data[offset] =
   2962               input1_data[offset] < max_value ? max_value : input1_data[offset];
   2963         }
   2964       }
   2965     }
   2966   }
   2967 }
   2968 
   2969 template <typename T1, typename T2, typename T3>
   2970 void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
   2971             T2* output_data, const Dims<4>& output_dims) {
   2972   // The current ArgMax implementation can only determine the index of the
   2973   // maximum value in the last dimension, so the axis argument is ignored.
   2974   TFLITE_DCHECK_EQ(axis[0], 3);
   2975 
   2976   // For ArgMax, the number of output dimensions = (number of input dimensions -
   2977   // 1). For the sake of simplicity, the output dimensions are equal to the
   2978   // input dimensions here. We enforce the constraint that the last dimension
   2979   // must always be 1.
   2980   TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
   2981   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   2982   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   2983   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   2984   const int depth = ArraySize(input_dims, 0);
   2985   for (int b = 0; b < batches; ++b) {
   2986     for (int y = 0; y < height; ++y) {
   2987       for (int x = 0; x < width; ++x) {
   2988         auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
   2989         int max_index = 0;
   2990         for (int d = 1; d < depth; ++d) {
   2991           const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
   2992           if (curr_value > max_value) {
   2993             max_value = curr_value;
   2994             max_index = d;
   2995           }
   2996         }
   2997         output_data[Offset(output_dims, 0, x, y, b)] = max_index;
   2998       }
   2999     }
   3000   }
   3001 }
   3002 
   3003 template <typename T>
   3004 void Transpose(const T* input, const Dims<4>& input_dims, T* output,
   3005                const Dims<4>& output_dims, int* permuted_axes) {
   3006   int out_sizes[4];
   3007   // Compute the inverse permutation array so we can do an output-centered
   3008   // transpose. Also check that output_dims matches input_dims.
   3009   for (int k = 0; k < 4; k++) {
   3010     out_sizes[k] =
   3011         MatchingArraySize(input_dims, permuted_axes[k], output_dims, k);
   3012   }
   3013 
   3014   // Naive transpose loop (iterate on output index and compute input index).
   3015   int o[4];  // loop index (on output).
   3016   int i[4];
   3017   for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) {
   3018     i[permuted_axes[3]] = o[3];
   3019     for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) {
   3020       i[permuted_axes[2]] = o[2];
   3021       for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) {
   3022         i[permuted_axes[1]] = o[1];
   3023         for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) {
   3024           i[permuted_axes[0]] = o[0];
   3025           output[Offset(output_dims, o)] = input[Offset(input_dims, i)];
   3026         }
   3027       }
   3028     }
   3029   }
   3030 }
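
        // Example (illustrative): permuted_axes = {0, 2, 1, 3} swaps the two
        // middle axes, since i[permuted_axes[k]] = o[k] makes every output
        // element (d0, d1, d2, d3) read input element (d0, d2, d1, d3).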
   3031 
   3032 }  // namespace reference_ops
   3033 }  // namespace tflite
   3034 
   3035 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
   3036