/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_

#include <stdint.h>
#include <sys/types.h>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>
#include <memory>
#include <type_traits>
#include <vector>

#include "Eigen/Core"
#include "fixedpoint/fixedpoint.h"
#include "public/gemmlowp.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/round.h"
#include "tensorflow/contrib/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

inline int32 MultiplyByQuantizedMultiplierSmallerThanOne(
    int32 x, int32 quantized_multiplier, int right_shift) {
  using gemmlowp::RoundingDivideByPOT;
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  return RoundingDivideByPOT(
      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
}

inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
    int32 x, int32 quantized_multiplier, int left_shift) {
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
                                           quantized_multiplier);
}
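// Illustrative worked example for the two functions above (values
// hypothetical, not from any particular model): a real multiplier of 0.375
// can be written as 0.75 * 2^-1, so one would pass
// quantized_multiplier = round(0.75 * 2^31) = 1610612736 and right_shift = 1.
// MultiplyByQuantizedMultiplierSmallerThanOne(1000, 1610612736, 1) then
// computes round(1000 * 0.75) = 750 via the rounding doubling high multiply,
// followed by a rounding right shift by 1, yielding 375 == round(1000 * 0.375).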
template <typename T>
int CountLeadingZeros(T integer_input) {
  static_assert(std::is_unsigned<T>::value,
                "Only unsigned integer types handled.");
  const T one_in_leading_positive = static_cast<T>(1)
                                    << (std::numeric_limits<T>::digits - 1);
  int leading_zeros = 0;
  while (integer_input < one_in_leading_positive) {
    integer_input <<= 1;
    ++leading_zeros;
  }
  return leading_zeros;
}

// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
// BROADCASTING.
//
// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
// rectangular array of numbers.
//
// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
// However, as Dims<N> is to be deprecated, this class exists as an adaptor
// to enable simple unoptimized implementations of element-wise broadcasting
// operations.
template <int N>
struct NdArrayDesc {
  // The "extent" of each dimension. Indices along dimension d must be in the
  // half-open interval [0, extents[d]).
  int extents[N];

  // The number of *elements* (not bytes) between consecutive indices of each
  // dimension.
  int strides[N];
};

// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// ELEMENT-WISE BROADCASTING.
//
// Same as Offset(), except it takes an NdArrayDesc<4> instead of a Dims<4>.
inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
                            int i3) {
  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
         i3 * desc.strides[3];
}

// Given the dimensions of the operands for an element-wise binary broadcast,
// adjusts them so that they can be directly iterated over with simple loops.
// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
//
// This function assumes that the two input shapes are compatible up to
// broadcasting and the shorter one has already been prepended with 1s to be the
// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
//
// When two shapes are compatible up to broadcasting, for each dimension d,
// the input extents are either equal, or one of them is 1.
//
// This function performs the following for each dimension d:
// - If the extents are equal, then do nothing since the loop that walks over
//   both of the input arrays is correct.
// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
//   array0 to be referenced *at any index* in dimension d and still access the
//   same slice.
template <int N>
inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
                                                const Dims<N>& input1_dims,
                                                NdArrayDesc<N>* desc0_out,
                                                NdArrayDesc<N>* desc1_out) {
  TFLITE_DCHECK(desc0_out != nullptr);
  TFLITE_DCHECK(desc1_out != nullptr);

  // Copy dims to desc.
  for (int i = 0; i < N; ++i) {
    desc0_out->extents[i] = input0_dims.sizes[i];
    desc0_out->strides[i] = input0_dims.strides[i];
    desc1_out->extents[i] = input1_dims.sizes[i];
    desc1_out->strides[i] = input1_dims.strides[i];
  }

  // Walk over each dimension. If the extents are equal do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other
  // and stride 0.
  for (int i = 0; i < N; ++i) {
    const int extent0 = ArraySize(input0_dims, i);
    const int extent1 = ArraySize(input1_dims, i);
    if (extent0 != extent1) {
      if (extent0 == 1) {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent1;
      } else {
        TFLITE_DCHECK_EQ(extent1, 1);
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent0;
      }
    }
  }
}
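// Illustrative example for the function above (shapes hypothetical):
// broadcasting a per-channel array of shape (1, 1, 1, 64) (input0) against an
// activation of shape (1, 16, 16, 64) (input1). In Dims<4> order the sizes
// are (64, 1, 1, 1) and (64, 16, 16, 1). After the call, desc1 is unchanged,
// while desc0's extents along the two middle dimensions grow from 1 to 16
// with their strides set to 0, so SubscriptToIndex(desc0, c, x, y, b) returns
// the same element for every (x, y): broadcasting without copying any data.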
inline void Conv(const float* input_data, const Dims<4>& input_dims,
                 const float* filter_data, const Dims<4>& filter_dims,
                 const float* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, float output_activation_min,
                 float output_activation_max, float* output_data,
                 const Dims<4>& output_dims, float* im2col_data,
                 const Dims<4>& im2col_dims) {
  (void)im2col_data;  // only used in optimized code.
  (void)im2col_dims;  // only used in optimized code.
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
  const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
  if (bias_data) {
    TFLITE_DCHECK_EQ(ArraySize(filter_dims, 3), ArraySize(bias_dims, 0));
  }
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          float total = 0.f;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int in_channel = 0; in_channel < input_depth;
                   ++in_channel) {
                const int in_x = in_x_origin + filter_x;
                const int in_y = in_y_origin + filter_y;
                // If the location is outside the bounds of the input image,
                // use zero as a default value.
                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height)) {
                  float input_value = input_data[Offset(input_dims, in_channel,
                                                        in_x, in_y, batch)];
                  float filter_value =
                      filter_data[Offset(filter_dims, in_channel, filter_x,
                                         filter_y, out_channel)];
                  total += (input_value * filter_value);
                }
              }
            }
          }
          float bias_value = 0.0f;
          if (bias_data) {
            bias_value = bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
          }
          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(total + bias_value,
                                           output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const float* input_data, const Dims<4>& input_dims,
          const float* filter_data, const Dims<4>& filter_dims,
          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
          int stride_height, int pad_width, int pad_height, float* output_data,
          const Dims<4>& output_dims, float* im2col_data,
          const Dims<4>& im2col_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
       stride_width, stride_height, pad_width, pad_height,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const float* input_data, const Dims<4>& input_dims,
          const float* filter_data, const Dims<4>& filter_dims,
          const float* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, float* output_data,
          const Dims<4>& output_dims, float* im2col_data,
          const Dims<4>& im2col_dims) {
  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
           bias_dims, stride, stride, pad_width, pad_height, output_data,
           output_dims, im2col_data, im2col_dims);
}
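// Note on the quantized Conv below: each uint8 value q represents the real
// value scale * (q - zero_point), the standard TFLite affine quantization. As
// passed in by the TFLite kernels, the offsets are the negated zero points,
// so the accumulator sums (filter_val + filter_offset) * (input_val +
// input_offset) terms in int32, and the final rescaling by output_multiplier /
// 2^output_shift encodes input_scale * filter_scale / output_scale, assumed
// smaller than one as the name MultiplyByQuantizedMultiplierSmallerThanOne
// requires.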
inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_dims;   // only used in optimized code.
  (void)gemm_context;  // only used in optimized code.
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
  const int output_depth =
      MatchingArraySize(filter_dims, 3, bias_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          int32 acc = 0;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int in_channel = 0; in_channel < input_depth;
                   ++in_channel) {
                const int in_x = in_x_origin + filter_x;
                const int in_y = in_y_origin + filter_y;
                // If the location is outside the bounds of the input image,
                // use zero as a default value.
                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height)) {
                  int32 input_val = input_data[Offset(input_dims, in_channel,
                                                      in_x, in_y, batch)];
                  int32 filter_val =
                      filter_data[Offset(filter_dims, in_channel, filter_x,
                                         filter_y, out_channel)];
                  acc += (filter_val + filter_offset) *
                         (input_val + input_offset);
                }
              }
            }
          }
          if (bias_data) {
            acc += bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
          }
          acc = MultiplyByQuantizedMultiplierSmallerThanOne(
              acc, output_multiplier, output_shift);
          acc += output_offset;
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
              static_cast<uint8>(acc);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
       filter_offset, bias_data, bias_dims, stride_width, stride_height,
       pad_width, pad_height, output_offset, output_multiplier, output_shift,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims, gemm_context);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const uint8* input_data, const Dims<4>& input_dims,
          int32 input_offset, const uint8* filter_data,
          const Dims<4>& filter_dims, int32 filter_offset,
          const int32* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, int32 output_offset,
          int32 output_multiplier, int output_shift,
          int32 output_activation_min, int32 output_activation_max,
          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
  Conv<Ac>(input_data, input_dims, input_offset, filter_data, filter_dims,
           filter_offset, bias_data, bias_dims, stride, stride, pad_width,
           pad_height, output_offset, output_multiplier, output_shift,
           output_activation_min, output_activation_max, output_data,
           output_dims, im2col_data, im2col_dims, gemm_context);
}
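// Illustrative example for DepthToSpace below (shapes hypothetical): with
// block_size = 2 and an input of shape (batches, height, width, depth) =
// (1, 2, 2, 4), the output has shape (1, 4, 4, 1). Output pixel
// (out_h, out_w, out_d) = (3, 2, 0) reads input channel
// in_d = 0 + ((3 % 2) * 2 + 2 % 2) * 1 = 2 at input pixel
// (in_h, in_w) = (3 / 2, 2 / 2) = (1, 1). SpaceToDepth further down is the
// exact inverse mapping.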
template <typename T>
inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
                         int block_size, T* output_data,
                         const Dims<4>& output_dims) {
  const int input_depth = ArraySize(input_dims, 0);
  const int input_width = ArraySize(input_dims, 1);
  const int input_height = ArraySize(input_dims, 2);
  const int input_batch = ArraySize(input_dims, 3);

  const int output_depth = ArraySize(output_dims, 0);
  const int output_width = ArraySize(output_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_batch = ArraySize(output_dims, 3);

  TFLITE_DCHECK_EQ(input_width * block_size, output_width);
  TFLITE_DCHECK_EQ(input_height * block_size, output_height);
  TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          const int in_d =
              out_d + ((out_h % block_size) * block_size + out_w % block_size) *
                          output_depth;
          const int in_w = out_w / block_size;
          const int in_h = out_h / block_size;
          const int in_b = out_b;

          const int output_index =
              Offset(output_dims, out_d, out_w, out_h, out_b);
          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}

template <typename T>
inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
                         int block_size, T* output_data,
                         const Dims<4>& output_dims) {
  const int input_depth = ArraySize(input_dims, 0);
  const int input_width = ArraySize(input_dims, 1);
  const int input_height = ArraySize(input_dims, 2);
  const int input_batch = ArraySize(input_dims, 3);

  const int output_depth = ArraySize(output_dims, 0);
  const int output_width = ArraySize(output_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_batch = ArraySize(output_dims, 3);

  TFLITE_DCHECK_EQ(input_width, output_width * block_size);
  TFLITE_DCHECK_EQ(input_height, output_height * block_size);
  TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int in_b = 0; in_b < input_batch; ++in_b) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      for (int in_w = 0; in_w < input_width; ++in_w) {
        for (int in_d = 0; in_d < input_depth; ++in_d) {
          const int out_d =
              in_d + ((in_h % block_size) * block_size + in_w % block_size) *
                         input_depth;
          const int out_w = in_w / block_size;
          const int out_h = in_h / block_size;
          const int out_b = in_b;

          const int output_index =
              Offset(output_dims, out_d, out_w, out_h, out_b);
          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}
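// FullyConnected below computes, for each batch b and output channel out_c,
//     output[b, out_c] = sum_d(input[b, d] * weights[out_c, d]) + bias[out_c]
// i.e. one matrix * vector product per batch element, followed by the fused
// activation clamp. The quantized variant further down follows the same
// structure, with the offset/multiplier arithmetic used by the quantized Conv
// above.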
inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                           const float* weights_data,
                           const Dims<4>& weights_dims, const float* bias_data,
                           const Dims<4>& bias_dims,
                           float output_activation_min,
                           float output_activation_max, float* output_data,
                           const Dims<4>& output_dims) {
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists of overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track, for each
  // array, of which dimension is the batch dimension in it.
  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                      ArraySize(output_dims, 3);
  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
  const int accum_depth = ArraySize(weights_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      float total = 0.f;
      for (int d = 0; d < accum_depth; ++d) {
        total += input_data[b * accum_depth + d] *
                 weights_data[out_c * accum_depth + d];
      }
      float bias_value = 0.0f;
      if (bias_data) {
        bias_value = bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
      }
      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
          total + bias_value, output_activation_min, output_activation_max);
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                    const float* weights_data, const Dims<4>& weights_dims,
                    const float* bias_data, const Dims<4>& bias_dims,
                    float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
                 bias_dims, output_activation_min, output_activation_max,
                 output_data, output_dims);
}
inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                           int32 input_offset, const uint8* filter_data,
                           const Dims<4>& filter_dims, int32 filter_offset,
                           const int32* bias_data, const Dims<4>& bias_dims,
                           int32 output_offset, int32 output_multiplier,
                           int output_shift, int32 output_activation_min,
                           int32 output_activation_max, uint8* output_data,
                           const Dims<4>& output_dims,
                           gemmlowp::GemmContext* gemm_context) {
  (void)gemm_context;  // only used in optimized code.
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists of overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track, for each
  // array, of which dimension is the batch dimension in it.
  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                      ArraySize(output_dims, 3);
  const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0);
  const int accum_depth = ArraySize(filter_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      int32 acc = 0;
      for (int d = 0; d < accum_depth; ++d) {
        int32 input_val = input_data[b * accum_depth + d];
        int32 filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data) {
        acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
      }
      acc = MultiplyByQuantizedMultiplierSmallerThanOne(acc, output_multiplier,
                                                        output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                    int32 input_offset, const uint8* filter_data,
                    const Dims<4>& filter_dims, int32 filter_offset,
                    const int32* bias_data, const Dims<4>& bias_dims,
                    int32 output_offset, int32 output_multiplier,
                    int output_shift, int32 output_activation_min,
                    int32 output_activation_max, uint8* output_data,
                    const Dims<4>& output_dims,
                    gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  FullyConnected(input_data, input_dims, input_offset, filter_data,
                 filter_dims, filter_offset, bias_data, bias_dims,
                 output_offset, output_multiplier, output_shift,
                 output_activation_min, output_activation_max, output_data,
                 output_dims, gemm_context);
}
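// Note: the two batch-normalization routines below apply the folded inference
// form
//     output = (input - mean) * multiplier + offset;
// in the usual folding, 'multiplier' would be gamma / sqrt(variance + epsilon),
// but these functions only see the precomputed multiplier_data and offset_data
// arrays, so that interpretation is up to the caller (an assumption about the
// surrounding graph transformation, not something this code enforces).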
template <FusedActivationFunctionType Ac>
void NonGlobalBatchNormalization(
    const float* input_data, const Dims<4>& input_dims, const float* mean_data,
    const Dims<4>& mean_dims, const float* multiplier_data,
    const Dims<4>& multiplier_dims, const float* offset_data,
    const Dims<4>& offset_dims, float* output_data,
    const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
                        offset_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
                        offset_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                        offset_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              (input_data[Offset(input_dims, c, x, y, b)] -
               mean_data[Offset(mean_dims, c, x, y, 0)]) *
                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
              offset_data[Offset(offset_dims, c, x, y, 0)]);
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
void GlobalBatchNormalization(const float* input_data,
                              const Dims<4>& input_dims, const float* mean_data,
                              const Dims<4>& mean_dims,
                              const float* multiplier_data,
                              const Dims<4>& multiplier_dims,
                              const float* offset_data,
                              const Dims<4>& offset_dims, float* output_data,
                              const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                        offset_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              (input_data[Offset(input_dims, c, x, y, b)] -
               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
        }
      }
    }
  }
}

inline void Relu(const float* input_data, const Dims<4>& input_dims,
                 float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float lower = 0;
          float clamped = val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}
inline void Relu1(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float upper = 1;
          const float lower = -1;
          float clamped = val > upper ? upper : val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

inline void Relu6(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float upper = 6;
          const float lower = 0;
          float clamped = val > upper ? upper : val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                     float* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone, "");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        float squared_l2_norm = 0;
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          squared_l2_norm += val * val;
        }
        float l2_norm = std::sqrt(squared_l2_norm);
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              input_data[Offset(input_dims, c, x, y, b)] / l2_norm;
        }
      }
    }
  }
}
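// GetInvSqrtQuantizedMultiplier below computes a fixed-point approximation of
// 1 / sqrt(input); it is used by the quantized L2Normalization further down.
// The input is first normalized into [2^27, 2^29) while tracking the shift,
// then the classic Newton-Raphson recurrence for the inverse square root,
//     x_{n+1} = x_n * (3 - input * x_n^2) / 2,
// is run for 5 iterations; the loop below expresses it as
// x = 1.5 * x - (input / 2) * x^3 in 3-integer-bit fixed point.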
inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
                                          int* output_shift) {
  *output_shift = 11;
  while (input >= (1 << 29)) {
    input /= 4;
    ++*output_shift;
  }
  TFLITE_DCHECK_GT(input, 0);
  const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
  *output_shift -= left_shift_bit_pairs;
  input <<= 2 * left_shift_bit_pairs;
  TFLITE_DCHECK_GE(input, (1 << 27));
  TFLITE_DCHECK_LT(input, (1 << 29));
  using gemmlowp::FixedPoint;
  using gemmlowp::Rescale;
  using gemmlowp::SaturatingRoundingMultiplyByPOT;
  // Using 3 integer bits gives us enough room for the internal arithmetic in
  // this Newton-Raphson iteration.
  using F3 = FixedPoint<int32, 3>;
  using F0 = FixedPoint<int32, 0>;
  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
  const F3 fixedpoint_half_input =
      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
  const F3 fixedpoint_half_three =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
  // Newton-Raphson iteration
  // Naive unoptimized starting guess: x = 1
  F3 x = F3::One();
  // Naive unoptimized number of iterations: 5
  for (int i = 0; i < 5; i++) {
    const F3 x3 = Rescale<3>(x * x * x);
    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
  }
  const F0 fixedpoint_half_sqrt_2 =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
  x = x * fixedpoint_half_sqrt_2;
  *output_inv_sqrt = x.raw();
  if (*output_shift < 0) {
    *output_inv_sqrt <<= -*output_shift;
    *output_shift = 0;
  }
}
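// In the quantized L2Normalization below, the output is effectively quantized
// with zero_point = 128 and scale = 1/128, i.e. the representable range is
// about [-1, 127/128]: each output lane is 128 + 128 * diff / l2_norm,
// computed as MultiplyByQuantizedMultiplierSmallerThanOne(128 * diff, ...)
// using the inverse-sqrt multiplier from GetInvSqrtQuantizedMultiplier, then
// clamped to [0, 255].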
inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
                            int32 input_zero_point, uint8* output_data,
                            const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  TFLITE_DCHECK_EQ(batches, 1);
  TFLITE_DCHECK_EQ(height, 1);
  TFLITE_DCHECK_EQ(width, 1);
  int32 square_l2_norm = 0;
  for (int i = 0; i < depth; i++) {
    int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
    square_l2_norm += diff * diff;
  }
  int32 inv_l2norm_multiplier;
  int inv_l2norm_shift;
  GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
                                &inv_l2norm_shift);

  for (int i = 0; i < depth; i++) {
    int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
    int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
        128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
    int32 unclamped_output_val = 128 + rescaled_diff;
    int32 output_val = std::min(255, std::max(0, unclamped_output_val));
    output_data[Offset(output_dims, i, 0, 0, 0)] =
        static_cast<uint8>(output_val);
  }
}

inline void Add(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] +
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Add(const float* input1_data, const Dims<4>& input1_dims,
         const float* input2_data, const Dims<4>& input2_dims,
         float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  Add(input1_data, input1_dims, input2_data, input2_dims,
      output_activation_min, output_activation_max, output_data, output_dims);
}

template <FusedActivationFunctionType Ac>
inline void Add(int left_shift, const uint8* input1_data,
                const Dims<4>& input1_dims, int32 input1_offset,
                int32 input1_multiplier, int input1_shift,
                const uint8* input2_data, const Dims<4>& input2_dims,
                int32 input2_offset, int32 input2_multiplier, int input2_shift,
                int32 output_offset, int32 output_multiplier, int output_shift,
                int32 output_activation_min, int32 output_activation_max,
                uint8* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const int32 input1_val =
              input1_offset + input1_data[Offset(input1_dims, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[Offset(input2_dims, c, x, y, b)];
          const int32 shifted_input1_val = input1_val * (1 << left_shift);
          const int32 shifted_input2_val = input2_val * (1 << left_shift);
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input1_val, input1_multiplier, input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input2_val, input2_multiplier, input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  raw_sum, output_multiplier, output_shift) +
              output_offset;
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, raw_output));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}
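// A sketch of the arithmetic in the quantized Add above, with hypothetical
// values: both zero-point-adjusted inputs are first shifted left by
// 'left_shift' to create headroom, then each is rescaled by its own
// fixed-point multiplier so that the two operands land on a common real
// scale. For instance, if input1 and input2 have real scales 0.02 and 0.01,
// the multipliers would be chosen so that one quantized step of input2
// contributes half as much to the sum as one step of input1. The int32 sum is
// then rescaled to the output scale and shifted by output_offset before
// clamping.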
// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <typename T>
void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T output_activation_min, T output_activation_max,
                  T* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac, typename T>
void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T* output_data, const Dims<4>& output_dims) {
  T output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
               output_activation_min, output_activation_max, output_data,
               output_dims);
}
inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                         const Dims<4>& input1_dims, int32 input1_offset,
                         int32 input1_multiplier, int input1_shift,
                         const uint8* input2_data, const Dims<4>& input2_dims,
                         int32 input2_offset, int32 input2_multiplier,
                         int input2_shift, int32 output_offset,
                         int32 output_multiplier, int output_shift,
                         int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          const int32 input1_val =
              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
          const int32 shifted_input1_val = input1_val * (1 << left_shift);
          const int32 shifted_input2_val = input2_val * (1 << left_shift);
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input1_val, input1_multiplier, input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input2_val, input2_multiplier, input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  raw_sum, output_multiplier, output_shift) +
              output_offset;
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, raw_output));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                         const Dims<4>& input1_dims, int32 input1_offset,
                         int32 input1_multiplier, int input1_shift,
                         const uint8* input2_data, const Dims<4>& input2_dims,
                         int32 input2_offset, int32 input2_multiplier,
                         int input2_shift, int32 output_offset,
                         int32 output_multiplier, int output_shift,
                         int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
               input1_multiplier, input1_shift, input2_data, input2_dims,
               input2_offset, input2_multiplier, input2_shift, output_offset,
               output_multiplier, output_shift, output_activation_min,
               output_activation_max, output_data, output_dims);
}
inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] *
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Mul(const float* input1_data, const Dims<4>& input1_dims,
         const float* input2_data, const Dims<4>& input2_dims,
         float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  Mul(input1_data, input1_dims, input2_data, input2_dims,
      output_activation_min, output_activation_max, output_data, output_dims);
}

// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <typename T>
void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T output_activation_min, T output_activation_max,
                  T* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac, typename T>
void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T* output_data, const Dims<4>& output_dims) {
  T output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  BroadcastMul(input1_data, input1_dims, input2_data, input2_dims,
               output_activation_min, output_activation_max, output_data,
               output_dims);
}
inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          const int32 input1_val =
              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
          const int32 unclamped_result =
              output_offset +
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  input1_val * input2_val, output_multiplier, output_shift);
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, unclamped_result));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
               input2_dims, input2_offset, output_offset, output_multiplier,
               output_shift, output_activation_min, output_activation_max,
               output_data, output_dims);
}
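// For the quantized BroadcastMul variants above, the zero-point-adjusted
// int32 product is rescaled by a single output_multiplier / output_shift pair
// which, in the usual TFLite convention, encodes
// input1_scale * input2_scale / output_scale (a real value smaller than one),
// followed by the output offset and the activation clamp.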
inline void Div(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] /
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] -
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac, typename Scalar>
void Concatenation(int concat_dim, const Scalar* const* input_data,
                   const Dims<4>* const* input_dims, int inputs_count,
                   Scalar* output_data, const Dims<4>& output_dims) {
  TFLITE_DCHECK_GT(inputs_count, 1);
  int concat_size = 0;
  for (int i = 0; i < inputs_count; i++) {
    for (int j = 0; j < 4; j++) {
      if (j != concat_dim) {
        MatchingArraySize(*input_dims[i], j, output_dims, j);
      }
    }
    concat_size += ArraySize(*input_dims[i], concat_dim);
  }
  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
  int outer_size = 1;
  for (int i = concat_dim + 1; i < 4; i++) {
    outer_size *= output_dims.sizes[i];
  }
  Scalar* output_ptr = output_data;
  for (int k = 0; k < outer_size; k++) {
    for (int i = 0; i < inputs_count; ++i) {
      const int copy_size =
          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
      memcpy(output_ptr, input_data[i] + k * copy_size,
             copy_size * sizeof(Scalar));
      output_ptr += copy_size;
    }
  }
}

template <FusedActivationFunctionType Ac, typename Scalar>
void DepthConcatenation(const Scalar* const* input_data,
                        const Dims<4>* const* input_dims, int inputs_count,
                        Scalar* output_data, const Dims<4>& output_dims) {
  Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
                            output_data, output_dims);
}
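// The float LstmCell below implements the standard LSTM recurrence. The
// fully-connected output (activ_temp) is split into four quarters along the
// depth dimension, yielding, per output coordinate c:
//     input_gate  i = sigmoid(activ_temp[0 * output_depth + c])
//     new_input   n = tanh   (activ_temp[1 * output_depth + c])
//     forget_gate f = sigmoid(activ_temp[2 * output_depth + c])
//     output_gate o = sigmoid(activ_temp[3 * output_depth + c])
//     new_state    = i * n + f * prev_state
//     output_activ = o * tanh(new_state)
// where sigmoid(a) = 1 / (1 + exp(-a)), exactly as written out in the loop
// nest below.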
inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
                     const float* prev_activ_data,
                     const Dims<4>& prev_activ_dims, const float* weights_data,
                     const Dims<4>& weights_dims, const float* bias_data,
                     const Dims<4>& bias_dims, const float* prev_state_data,
                     const Dims<4>& prev_state_dims, float* output_state_data,
                     const Dims<4>& output_state_dims, float* output_activ_data,
                     const Dims<4>& output_activ_dims, float* concat_temp_data,
                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
                     const Dims<4>& activ_temp_dims) {
  const int batches =
      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
                        output_state_dims, 3, output_activ_dims, 3);
  const int height =
      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
                        output_state_dims, 2, output_activ_dims, 2);
  const int width =
      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
                        output_state_dims, 1, output_activ_dims, 1);
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
  const int input_depth = ArraySize(input_dims, 0);
  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
  const int total_input_depth = prev_activ_depth + input_depth;
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
                  1);
  const int intern_activ_depth =
      MatchingArraySize(weights_dims, 1, bias_dims, 0);
  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
  const int output_depth =
      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
                        output_state_dims, 0, output_activ_dims, 0);
  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);

  // Concatenate prev_activ and input data together
  std::vector<float const*> concat_input_arrays_data;
  std::vector<Dims<4> const*> concat_input_arrays_dims;
  concat_input_arrays_data.push_back(input_data);
  concat_input_arrays_data.push_back(prev_activ_data);
  concat_input_arrays_dims.push_back(&input_dims);
  concat_input_arrays_dims.push_back(&prev_activ_dims);
  Concatenation<FusedActivationFunctionType::kNone, float>(
      0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]),
      concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims);

  // Fully connected
  FullyConnected<FusedActivationFunctionType::kNone>(
      concat_temp_data, concat_temp_dims, weights_data, weights_dims,
      bias_data, bias_dims, activ_temp_data, activ_temp_dims);

  // Memory state update (the LSTM "guts")
  for (int b = 0; b < batches; ++b) {
    for (int w = 0; w < width; ++w) {
      for (int h = 0; h < height; ++h) {
        for (int c = 0; c < output_depth; ++c) {
          const float input_gate =
              1.f /
              (1.f + std::exp(-activ_temp_data[Offset(
                         activ_temp_dims, 0 * output_depth + c, w, h, b)]));
          const float new_input = std::tanh(activ_temp_data[Offset(
              activ_temp_dims, 1 * output_depth + c, w, h, b)]);
          const float forget_gate =
              1.f /
              (1.f + std::exp(-activ_temp_data[Offset(
                         activ_temp_dims, 2 * output_depth + c, w, h, b)]));
          const float output_gate =
              1.f /
              (1.f + std::exp(-activ_temp_data[Offset(
                         activ_temp_dims, 3 * output_depth + c, w, h, b)]));
          const float new_state =
              input_gate * new_input +
              forget_gate *
                  prev_state_data[Offset(prev_state_dims, c, w, h, b)];
          output_state_data[Offset(output_state_dims, c, w, h, b)] = new_state;
          output_activ_data[Offset(output_activ_dims, c, w, h, b)] =
              output_gate * std::tanh(new_state);
        }
      }
    }
  }
}
1382 // - The output of the internal fully-connected node is int16-quantized 1383 // on the interval [-8, 8 * 32767/32768], the rationale for which is 1384 // explained just below ("Why [-8, 8] for fully-connected output?"). 1385 // 1386 // 1387 // === The StateIntegerBits template parameter === 1388 // 1389 // The StateIntegerBits template parameter controls the fixed-point format used 1390 // to represent the internal memory of the LSTM cell (not the 1391 // output-at-previous-timestep, the other internal state array). It's currently 1392 // a template parameter so that the model can control that. The most typical 1393 // value for StateIntegerBits is 4. Other plausible values are anywhere between 1394 // 3 and 5. We might eventually standardize on a single supported value, e.g. 4, 1395 // and drop that template parameter. The reason why it can't be a runtime 1396 // parameter is that this controls the fixed-point format used, i.e. we need to 1397 // generate actually different code based on it. In particular, we generate code 1398 // for a fixed-point tanh() implementation for that format, which internally 1399 // uses a fixed-point exp() implementation, which internally uses a 1400 // barrel-shifter with a number of steps that depends on StateIntegerBits. 1401 // Another consequence of that is that a higher value of StateIntegerBits 1402 // results in a more expensive implementation (more barrel shifter steps 1403 // needed). 1404 // 1405 // 1406 // === Why [-8, 8] for fully-connected output? === 1407 // 1408 // This array is only fed to Logistic and Tanh functions, for which 1409 // the quantized implementation will want to use fixed-point arithmetic, 1410 // requiring a power-of-two representation interval. Thus, we should right 1411 // away quantize this array to a power-of-two interval; otherwise, 1412 // implementation will need to rescale that, losing any benefit that a tighter 1413 // representation interval might otherwise yield, while introducting some 1414 // numerical error and computational overhead. 1415 // 1416 // Now, Logistic and Tanh 1417 // are nearly constant (nearly equal to their horizontal asymptotes) 1418 // outside of a small bounded interval around 0: 1419 // 1420 // Logistic(4) = 1 - 1.8e-2 Tanh(4) = 1 - 6.7e-4 1421 // Logistic(8) = 1 - 3.4e-4 Tanh(8) = 1 - 2.3e-7 1422 // Logistic(16) = 1 - 1.1e-7 Tanh(16) = 1 - 2.5e-14 1423 // 1424 // From this, we see that clamping to [-4, 4] would be too inaccurate 1425 // (the error of 1.8e-2 on Logistic would be felt even in 8bit precision) 1426 // while clamping to [-16, 16] would make no difference even in float32. 1427 // However, for a fixed-point implementation in 16-bit integers, using 5 1428 // integer bits to represent the [-16, 16] range would leave only 11 1429 // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive 1430 // representable values. Notice that that is higher than the 1431 // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic. 1432 // Using [-8, 8] thus seems like the better compromise overall, enjoying 1433 // an increment of 2.4e-4 between representable values and a worst-case 1434 // clamping error of 3.4e-4, both better than the increment of 4.9e-4 with 1435 // [-16, 16]. 
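//
// To make those increments explicit: a 16-bit representation has 2^16
// quantization steps, so the increment is simply (interval width) / 2^16.
// For [-8, 8] that is 16 / 2^16 = 2^-12 ~= 2.44e-4, and for [-16, 16] it is
// 32 / 2^16 = 2^-11 ~= 4.88e-4 -- the figures quoted above.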
1436 //
1437 // Moreover, all other things being equal, it is nice to choose the narrower
1438 // representation range, as that makes the implementation of fixed-point
1439 // math functions a little cheaper (each integer bit requires an additional
1440 // barrel-shifter step in the implementation of exp(-x)). That is a further
1441 // reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
1442 // sense for 32-bit float or 32-bit fixed-point quantization, but we are
1443 // aiming for 16-bit fixed-point quantization of these internal nodes here.
1444 //
1445 template <int StateIntegerBits>
1446 void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
1447               const uint8* prev_activ_data_uint8,
1448               const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
1449               const Dims<4>& weights_dims, const int32* bias_data_int32,
1450               const Dims<4>& bias_dims, const int16* prev_state_data_int16,
1451               const Dims<4>& prev_state_dims, int16* output_state_data_int16,
1452               const Dims<4>& output_state_dims, uint8* output_activ_data_uint8,
1453               const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8,
1454               const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16,
1455               const Dims<4>& activ_temp_dims, int32 weights_zero_point,
1456               int32 accum_multiplier, int accum_shift,
1457               gemmlowp::GemmContext* gemm_context) {
1458   (void)gemm_context;  // only used in optimized code.
1459
1460   // Gather dimension information, and perform consistency checks.
1461   const int batches =
1462       MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
1463                         output_state_dims, 3, output_activ_dims, 3);
1464   const int height =
1465       MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
1466                         output_state_dims, 2, output_activ_dims, 2);
1467   const int width =
1468       MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
1469                         output_state_dims, 1, output_activ_dims, 1);
1470   TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
1471   TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
1472   const int input_depth = ArraySize(input_dims, 0);
1473   const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
1474   const int total_input_depth = prev_activ_depth + input_depth;
1475   TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
1476   TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
1477                   1);
1478   const int intern_activ_depth =
1479       MatchingArraySize(weights_dims, 1, bias_dims, 0);
1480   TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
1481   const int output_depth =
1482       MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
1483                         output_state_dims, 0, output_activ_dims, 0);
1484   TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
1485   const int fc_batches = ArraySize(activ_temp_dims, 1) *
1486                          ArraySize(activ_temp_dims, 2) *
1487                          ArraySize(activ_temp_dims, 3);
1488   const int fc_output_depth =
1489       MatchingArraySize(weights_dims, 1, activ_temp_dims, 0);
1490   const int fc_accum_depth = ArraySize(weights_dims, 0);
1491   TFLITE_CHECK_EQ(fc_output_depth, 4 * output_depth);
1492
1493   // Depth-concatenate prev_activ and input data together.
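  // As in the float version above, the concatenated array has depth
  // total_input_depth = input_depth + prev_activ_depth. Plain byte-wise
  // concatenation is only valid here because both inputs share the same
  // [-1, 127/128] uint8 quantization, per the range discussion in the
  // function comment.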
1494 uint8 const* concat_input_arrays_data[2] = {input_data_uint8, 1495 prev_activ_data_uint8}; 1496 Dims<4> const* concat_input_arrays_dims[2] = {&input_dims, &prev_activ_dims}; 1497 Concatenation<FusedActivationFunctionType::kNone, uint8>( 1498 0, concat_input_arrays_data, concat_input_arrays_dims, 2, 1499 concat_temp_data_uint8, concat_temp_dims); 1500 1501 // Implementation of the fully connected node inside the LSTM cell. 1502 // The operands are 8-bit integers, the accumulators are internally 32bit 1503 // integers, and the output is 16-bit fixed-point with 3 integer bits so 1504 // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that 1505 // is explained in the function comment above. 1506 for (int b = 0; b < fc_batches; ++b) { 1507 for (int out_c = 0; out_c < fc_output_depth; ++out_c) { 1508 // Internal accumulation. 1509 // Initialize accumulator with the bias-value. 1510 int32 accum = bias_data_int32[out_c]; 1511 // Accumulation loop. 1512 for (int d = 0; d < fc_accum_depth; ++d) { 1513 int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128; 1514 int16 weights_val = 1515 weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point; 1516 accum += input_val * weights_val; 1517 } 1518 // Down-scale the final int32 accumulator to the scale used by our 1519 // (16-bit, using 3 integer bits) fixed-point format. The quantized 1520 // multiplier and shift here have been pre-computed offline 1521 // (e.g. by toco). 1522 accum = 1523 MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift); 1524 // Saturate, cast to int16, and store to the temporary activations array. 1525 accum = std::max(-32768, std::min(32767, accum)); 1526 activ_temp_data_int16[out_c + fc_output_depth * b] = accum; 1527 } 1528 } 1529 1530 // Rest of the LSTM cell: tanh and logistic math functions, and some adds 1531 // and muls, all done in 16-bit fixed-point. 1532 const int outer_size = batches * width * height; 1533 for (int b = 0; b < outer_size; ++b) { 1534 for (int c = 0; c < output_depth; ++c) { 1535 // Define the fixed-point data types that we will use here. All use 1536 // int16 as the underlying integer type i.e. all are 16-bit fixed-point. 1537 // They only differ by the number of integral vs. fractional bits, 1538 // determining the range of values that they can represent. 1539 // 1540 // F0 uses 0 integer bits, range [-1, 1]. 1541 // This is the return type of math functions such as tanh, logistic, 1542 // whose range is in [-1, 1]. 1543 using F0 = gemmlowp::FixedPoint<std::int16_t, 0>; 1544 // F3 uses 3 integer bits, range [-8, 8]. 1545 // This is the range of the previous fully-connected node's output, 1546 // which is our input here. 1547 using F3 = gemmlowp::FixedPoint<std::int16_t, 3>; 1548 // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits, 1549 // 2^StateIntegerBits]. It's used to represent the internal state, whose 1550 // number of integer bits is currently dictated by the model. See comment 1551 // on the StateIntegerBits template parameter above. 1552 using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>; 1553 // Implementation of input gate, using fixed-point logistic function. 1554 F3 input_gate_input = F3::FromRaw( 1555 activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]); 1556 F0 input_gate_output = gemmlowp::logistic(input_gate_input); 1557 // Implementation of input modulation gate, using fixed-point tanh 1558 // function. 
1559       F3 input_modulation_gate_input = F3::FromRaw(
1560           activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
1561       F0 input_modulation_gate_output =
1562           gemmlowp::tanh(input_modulation_gate_input);
1563       // Implementation of forget gate, using fixed-point logistic function.
1564       F3 forget_gate_input = F3::FromRaw(
1565           activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
1566       F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
1567       // Implementation of output gate, using fixed-point logistic function.
1568       F3 output_gate_input = F3::FromRaw(
1569           activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
1570       F0 output_gate_output = gemmlowp::logistic(output_gate_input);
1571       // Implementation of internal multiplication nodes, still in fixed-point.
1572       F0 input_times_input_modulation =
1573           input_gate_output * input_modulation_gate_output;
1574       FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
1575       FS prev_state_times_forget_state = forget_gate_output * prev_state;
1576       // Implementation of internal addition node, saturating.
1577       FS new_state = gemmlowp::SaturatingAdd(
1578           gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
1579           prev_state_times_forget_state);
1580       // Implementation of last internal Tanh node, still in fixed-point.
1581       // A fixed-point Tanh implementation is specialized for a given number
1582       // of integer bits, and each specialization can have substantial code
1583       // size. We already used a Tanh on an input with 3 integer bits above,
1584       // and per the table in the function comment there is no significant
1585       // accuracy loss in clamping to [-8, +8] for a 3-integer-bits
1586       // representation, so we simply reuse that format here. This helps
1587       // people porting this to targets where code footprint must be minimized.
1588       F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
1589       F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
1590       // Store the new internal state back to memory, as 16-bit integers.
1591       // Note: here we store the original value with StateIntegerBits, not
1592       // the rescaled 3-integer-bits value fed to tanh.
1593       output_state_data_int16[b * output_depth + c] = new_state.raw();
1594       // Down-scale the output activations to 8-bit integers, saturating,
1595       // and store back to memory.
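      // (The raw F0 value is a Q0.15 int16 in [-32768, 32767] representing
      // [-1, 1); rounding-dividing by 2^8 maps it to [-128, 128], the clamp
      // brings it into [-128, 127], and adding 128 yields the [0, 255] uint8
      // encoding of the [-1, 127/128] output interval described above.)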
1596       int16 rescaled_output_activ =
1597           gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
1598       int16 clamped_output_activ =
1599           std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
1600       output_activ_data_uint8[b * output_depth + c] =
1601           128 + clamped_output_activ;
1602     }
1603   }
1604 }
1605
1606 template <typename Scalar>
1607 void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
1608                      int axis, int outputs_count, Scalar* const* output_data,
1609                      const Dims<4>* const* output_dims) {
1610   const int batches = ArraySize(*output_dims[0], 3);
1611   const int height = ArraySize(*output_dims[0], 2);
1612   const int width = ArraySize(*output_dims[0], 1);
1613   const int depth = ArraySize(*output_dims[0], 0);
1614
1615   const int slice_size = ArraySize(*output_dims[0], axis);
1616
1617   for (int i = 0; i < outputs_count; ++i) {
1618     int offset = i * slice_size * input_dims.strides[axis];
1619     for (int b = 0; b < batches; ++b) {
1620       for (int y = 0; y < height; ++y) {
1621         for (int x = 0; x < width; ++x) {
1622           for (int c = 0; c < depth; ++c) {
1623             auto out = Offset(*output_dims[i], c, x, y, b);
1624             auto in = Offset(input_dims, c, x, y, b);
1625             output_data[i][out] = input_data[offset + in];
1626           }
1627         }
1628       }
1629     }
1630   }
1631 }
1632
1633 template <FusedActivationFunctionType Ac, typename Scalar>
1634 void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
1635                      int outputs_count, Scalar* const* output_data,
1636                      const Dims<4>* const* output_dims) {
1637   TFLITE_DCHECK_GE(outputs_count, 1);
1638   for (int i = 0; i < outputs_count; i++) {
1639     /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
1640     /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
1641     /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
1642   }
1643   // For now we don't have a model with a TensorFlowSplit with a fused
1644   // activation function.
1645   TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
1646
1647   TensorFlowSplit(input_data, input_dims, /*axis=*/0, outputs_count,
1648                   output_data, output_dims);
1649 }
1650
1651 // TODO(benoitjacob) make this a proper reference impl without Eigen!
1652 template <typename Scalar> 1653 using MatrixMap = typename std::conditional< 1654 std::is_const<Scalar>::value, 1655 Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, 1656 Eigen::Dynamic, Eigen::Dynamic>>, 1657 Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; 1658 1659 template <typename Scalar, int N> 1660 MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data, 1661 const Dims<N>& dims) { 1662 const int rows = dims.sizes[0]; 1663 int cols = 1; 1664 for (int d = 1; d < N; d++) { 1665 cols *= dims.sizes[d]; 1666 } 1667 return MatrixMap<Scalar>(data, rows, cols); 1668 } 1669 1670 template <typename Scalar, int N> 1671 MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data, 1672 const Dims<N>& dims) { 1673 const int cols = dims.sizes[N - 1]; 1674 int rows = 1; 1675 for (int d = 0; d < N - 1; d++) { 1676 rows *= dims.sizes[d]; 1677 } 1678 return MatrixMap<Scalar>(data, rows, cols); 1679 } 1680 1681 inline int NodeOffset(int b, int h, int w, int height, int width) { 1682 return (b * height + h) * width + w; 1683 } 1684 1685 inline void AveragePool(const float* input_data, const Dims<4>& input_dims, 1686 int stride_width, int stride_height, int pad_width, 1687 int pad_height, int filter_width, int filter_height, 1688 float output_activation_min, 1689 float output_activation_max, float* output_data, 1690 const Dims<4>& output_dims) { 1691 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1692 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 1693 const int input_height = ArraySize(input_dims, 2); 1694 const int input_width = ArraySize(input_dims, 1); 1695 const int output_height = ArraySize(output_dims, 2); 1696 const int output_width = ArraySize(output_dims, 1); 1697 for (int batch = 0; batch < batches; ++batch) { 1698 for (int out_y = 0; out_y < output_height; ++out_y) { 1699 for (int out_x = 0; out_x < output_width; ++out_x) { 1700 for (int channel = 0; channel < depth; ++channel) { 1701 const int in_x_origin = (out_x * stride_width) - pad_width; 1702 const int in_y_origin = (out_y * stride_height) - pad_height; 1703 // Compute the boundaries of the filter region clamped so as to 1704 // ensure that the filter window fits in the input array. 
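          // For instance (illustrative values): with pad_width = 1,
          // stride_width = 1 and filter_width = 3, the first output column
          // has in_x_origin = -1, so filter_x_start = 1 and the window only
          // covers input columns 0 and 1.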
1705 const int filter_x_start = std::max(0, -in_x_origin); 1706 const int filter_x_end = 1707 std::min(filter_width, input_width - in_x_origin); 1708 const int filter_y_start = std::max(0, -in_y_origin); 1709 const int filter_y_end = 1710 std::min(filter_height, input_height - in_y_origin); 1711 float total = 0.f; 1712 float filter_count = 0; 1713 for (int filter_y = filter_y_start; filter_y < filter_y_end; 1714 ++filter_y) { 1715 for (int filter_x = filter_x_start; filter_x < filter_x_end; 1716 ++filter_x) { 1717 const int in_x = in_x_origin + filter_x; 1718 const int in_y = in_y_origin + filter_y; 1719 total += 1720 input_data[Offset(input_dims, channel, in_x, in_y, batch)]; 1721 filter_count++; 1722 } 1723 } 1724 const float average = total / filter_count; 1725 output_data[Offset(output_dims, channel, out_x, out_y, batch)] = 1726 ActivationFunctionWithMinMax(average, output_activation_min, 1727 output_activation_max); 1728 } 1729 } 1730 } 1731 } 1732 } 1733 1734 // legacy, for compatibility with old checked-in code 1735 template <FusedActivationFunctionType Ac> 1736 void AveragePool(const float* input_data, const Dims<4>& input_dims, 1737 int stride_width, int stride_height, int pad_width, 1738 int pad_height, int filter_width, int filter_height, 1739 float* output_data, const Dims<4>& output_dims) { 1740 float output_activation_min, output_activation_max; 1741 GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); 1742 AveragePool(input_data, input_dims, stride_width, stride_height, pad_width, 1743 pad_height, filter_width, filter_height, output_activation_min, 1744 output_activation_max, output_data, output_dims); 1745 } 1746 1747 // legacy, for compatibility with old checked-in code 1748 template <FusedActivationFunctionType Ac> 1749 void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride, 1750 int pad_width, int pad_height, int filter_width, 1751 int filter_height, float* output_data, 1752 const Dims<4>& output_dims) { 1753 AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, 1754 filter_width, filter_height, output_data, output_dims); 1755 } 1756 1757 inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, 1758 int stride_width, int stride_height, int pad_width, 1759 int pad_height, int filter_width, int filter_height, 1760 int32 output_activation_min, 1761 int32 output_activation_max, uint8* output_data, 1762 const Dims<4>& output_dims) { 1763 TFLITE_DCHECK_LE(output_activation_min, output_activation_max); 1764 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1765 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 1766 const int input_height = ArraySize(input_dims, 2); 1767 const int input_width = ArraySize(input_dims, 1); 1768 const int output_height = ArraySize(output_dims, 2); 1769 const int output_width = ArraySize(output_dims, 1); 1770 for (int batch = 0; batch < batches; ++batch) { 1771 for (int out_y = 0; out_y < output_height; ++out_y) { 1772 for (int out_x = 0; out_x < output_width; ++out_x) { 1773 for (int channel = 0; channel < depth; ++channel) { 1774 const int in_x_origin = (out_x * stride_width) - pad_width; 1775 const int in_y_origin = (out_y * stride_height) - pad_height; 1776 // Compute the boundaries of the filter region clamped so as to 1777 // ensure that the filter window fits in the input array. 
1778 const int filter_x_start = std::max(0, -in_x_origin); 1779 const int filter_x_end = 1780 std::min(filter_width, input_width - in_x_origin); 1781 const int filter_y_start = std::max(0, -in_y_origin); 1782 const int filter_y_end = 1783 std::min(filter_height, input_height - in_y_origin); 1784 int32 acc = 0; 1785 int filter_count = 0; 1786 for (int filter_y = filter_y_start; filter_y < filter_y_end; 1787 ++filter_y) { 1788 for (int filter_x = filter_x_start; filter_x < filter_x_end; 1789 ++filter_x) { 1790 const int in_x = in_x_origin + filter_x; 1791 const int in_y = in_y_origin + filter_y; 1792 acc += input_data[Offset(input_dims, channel, in_x, in_y, batch)]; 1793 filter_count++; 1794 } 1795 } 1796 acc = (acc + filter_count / 2) / filter_count; 1797 acc = std::max(acc, output_activation_min); 1798 acc = std::min(acc, output_activation_max); 1799 output_data[Offset(output_dims, channel, out_x, out_y, batch)] = 1800 static_cast<uint8>(acc); 1801 } 1802 } 1803 } 1804 } 1805 } 1806 1807 // legacy, for compatibility with old checked-in code 1808 template <FusedActivationFunctionType Ac> 1809 void AveragePool(const uint8* input_data, const Dims<4>& input_dims, 1810 int stride_width, int stride_height, int pad_width, 1811 int pad_height, int filter_width, int filter_height, 1812 int32 output_activation_min, int32 output_activation_max, 1813 uint8* output_data, const Dims<4>& output_dims) { 1814 static_assert(Ac == FusedActivationFunctionType::kNone || 1815 Ac == FusedActivationFunctionType::kRelu || 1816 Ac == FusedActivationFunctionType::kRelu6 || 1817 Ac == FusedActivationFunctionType::kRelu1, 1818 ""); 1819 if (Ac == FusedActivationFunctionType::kNone) { 1820 TFLITE_DCHECK_EQ(output_activation_min, 0); 1821 TFLITE_DCHECK_EQ(output_activation_max, 255); 1822 } 1823 AveragePool(input_data, input_dims, stride_width, stride_height, pad_width, 1824 pad_height, filter_width, filter_height, output_activation_min, 1825 output_activation_max, output_data, output_dims); 1826 } 1827 1828 // legacy, for compatibility with old checked-in code 1829 template <FusedActivationFunctionType Ac> 1830 void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride, 1831 int pad_width, int pad_height, int filter_width, 1832 int filter_height, int32 output_activation_min, 1833 int32 output_activation_max, uint8* output_data, 1834 const Dims<4>& output_dims) { 1835 AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, 1836 filter_width, filter_height, output_activation_min, 1837 output_activation_max, output_data, output_dims); 1838 } 1839 1840 inline void L2Pool(const float* input_data, const Dims<4>& input_dims, 1841 int stride_width, int stride_height, int pad_width, 1842 int pad_height, int filter_width, int filter_height, 1843 float output_activation_min, float output_activation_max, 1844 float* output_data, const Dims<4>& output_dims) { 1845 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1846 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 1847 const int input_height = ArraySize(input_dims, 2); 1848 const int input_width = ArraySize(input_dims, 1); 1849 const int output_height = ArraySize(output_dims, 2); 1850 const int output_width = ArraySize(output_dims, 1); 1851 for (int batch = 0; batch < batches; ++batch) { 1852 for (int out_y = 0; out_y < output_height; ++out_y) { 1853 for (int out_x = 0; out_x < output_width; ++out_x) { 1854 for (int channel = 0; channel < depth; ++channel) { 1855 const int in_x_origin = (out_x * 
stride_width) - pad_width; 1856 const int in_y_origin = (out_y * stride_height) - pad_height; 1857 // Compute the boundaries of the filter region clamped so as to 1858 // ensure that the filter window fits in the input array. 1859 const int filter_x_start = std::max(0, -in_x_origin); 1860 const int filter_x_end = 1861 std::min(filter_width, input_width - in_x_origin); 1862 const int filter_y_start = std::max(0, -in_y_origin); 1863 const int filter_y_end = 1864 std::min(filter_height, input_height - in_y_origin); 1865 float sum_squares = 0.f; 1866 int filter_count = 0; 1867 for (int filter_y = filter_y_start; filter_y < filter_y_end; 1868 ++filter_y) { 1869 for (int filter_x = filter_x_start; filter_x < filter_x_end; 1870 ++filter_x) { 1871 const int in_x = in_x_origin + filter_x; 1872 const int in_y = in_y_origin + filter_y; 1873 const float val = 1874 input_data[Offset(input_dims, channel, in_x, in_y, batch)]; 1875 sum_squares += val * val; 1876 filter_count++; 1877 } 1878 } 1879 const float l2pool_result = std::sqrt(sum_squares / filter_count); 1880 output_data[Offset(output_dims, channel, out_x, out_y, batch)] = 1881 ActivationFunctionWithMinMax(l2pool_result, output_activation_min, 1882 output_activation_max); 1883 } 1884 } 1885 } 1886 } 1887 } 1888 1889 // legacy, for compatibility with old checked-in code 1890 template <FusedActivationFunctionType Ac> 1891 void L2Pool(const float* input_data, const Dims<4>& input_dims, 1892 int stride_width, int stride_height, int pad_width, int pad_height, 1893 int filter_width, int filter_height, float* output_data, 1894 const Dims<4>& output_dims) { 1895 float output_activation_min, output_activation_max; 1896 GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); 1897 1898 L2Pool(input_data, input_dims, stride_width, stride_height, pad_width, 1899 pad_height, filter_width, filter_height, output_activation_min, 1900 output_activation_max, output_data, output_dims); 1901 } 1902 1903 // legacy, for compatibility with old checked-in code 1904 template <FusedActivationFunctionType Ac> 1905 void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride, 1906 int pad_width, int pad_height, int filter_width, int filter_height, 1907 float* output_data, const Dims<4>& output_dims) { 1908 L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, 1909 filter_width, filter_height, output_data, output_dims); 1910 } 1911 1912 inline void MaxPool(const float* input_data, const Dims<4>& input_dims, 1913 int stride_width, int stride_height, int pad_width, 1914 int pad_height, int filter_width, int filter_height, 1915 float output_activation_min, float output_activation_max, 1916 float* output_data, const Dims<4>& output_dims) { 1917 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1918 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 1919 const int input_height = ArraySize(input_dims, 2); 1920 const int input_width = ArraySize(input_dims, 1); 1921 const int output_height = ArraySize(output_dims, 2); 1922 const int output_width = ArraySize(output_dims, 1); 1923 for (int batch = 0; batch < batches; ++batch) { 1924 for (int out_y = 0; out_y < output_height; ++out_y) { 1925 for (int out_x = 0; out_x < output_width; ++out_x) { 1926 for (int channel = 0; channel < depth; ++channel) { 1927 const int in_x_origin = (out_x * stride_width) - pad_width; 1928 const int in_y_origin = (out_y * stride_height) - pad_height; 1929 // Compute the boundaries of the filter region clamped so as to 1930 
// ensure that the filter window fits in the input array. 1931 const int filter_x_start = std::max(0, -in_x_origin); 1932 const int filter_x_end = 1933 std::min(filter_width, input_width - in_x_origin); 1934 const int filter_y_start = std::max(0, -in_y_origin); 1935 const int filter_y_end = 1936 std::min(filter_height, input_height - in_y_origin); 1937 float max = std::numeric_limits<float>::lowest(); 1938 for (int filter_y = filter_y_start; filter_y < filter_y_end; 1939 ++filter_y) { 1940 for (int filter_x = filter_x_start; filter_x < filter_x_end; 1941 ++filter_x) { 1942 const int in_x = in_x_origin + filter_x; 1943 const int in_y = in_y_origin + filter_y; 1944 max = std::max( 1945 max, 1946 input_data[Offset(input_dims, channel, in_x, in_y, batch)]); 1947 } 1948 } 1949 output_data[Offset(output_dims, channel, out_x, out_y, batch)] = 1950 ActivationFunctionWithMinMax(max, output_activation_min, 1951 output_activation_max); 1952 } 1953 } 1954 } 1955 } 1956 } 1957 1958 // legacy, for compatibility with old checked-in code 1959 template <FusedActivationFunctionType Ac> 1960 void MaxPool(const float* input_data, const Dims<4>& input_dims, 1961 int stride_width, int stride_height, int pad_width, int pad_height, 1962 int filter_width, int filter_height, float* output_data, 1963 const Dims<4>& output_dims) { 1964 float output_activation_min, output_activation_max; 1965 GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); 1966 MaxPool(input_data, input_dims, stride_width, stride_height, pad_width, 1967 pad_height, filter_width, filter_height, output_activation_min, 1968 output_activation_max, output_data, output_dims); 1969 } 1970 1971 // legacy, for compatibility with old checked-in code 1972 template <FusedActivationFunctionType Ac> 1973 void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride, 1974 int pad_width, int pad_height, int filter_width, int filter_height, 1975 float* output_data, const Dims<4>& output_dims) { 1976 MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, 1977 filter_width, filter_height, output_data, output_dims); 1978 } 1979 1980 inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims, 1981 int stride_width, int stride_height, int pad_width, 1982 int pad_height, int filter_width, int filter_height, 1983 int32 output_activation_min, int32 output_activation_max, 1984 uint8* output_data, const Dims<4>& output_dims) { 1985 TFLITE_DCHECK_LE(output_activation_min, output_activation_max); 1986 TFLITE_DCHECK_GE(output_activation_min, 0); 1987 TFLITE_DCHECK_LE(output_activation_max, 255); 1988 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1989 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 1990 const int input_height = ArraySize(input_dims, 2); 1991 const int input_width = ArraySize(input_dims, 1); 1992 const int output_height = ArraySize(output_dims, 2); 1993 const int output_width = ArraySize(output_dims, 1); 1994 for (int batch = 0; batch < batches; ++batch) { 1995 for (int out_y = 0; out_y < output_height; ++out_y) { 1996 for (int out_x = 0; out_x < output_width; ++out_x) { 1997 for (int channel = 0; channel < depth; ++channel) { 1998 const int in_x_origin = (out_x * stride_width) - pad_width; 1999 const int in_y_origin = (out_y * stride_height) - pad_height; 2000 // Compute the boundaries of the filter region clamped so as to 2001 // ensure that the filter window fits in the input array. 
2002 const int filter_x_start = std::max(0, -in_x_origin); 2003 const int filter_x_end = 2004 std::min(filter_width, input_width - in_x_origin); 2005 const int filter_y_start = std::max(0, -in_y_origin); 2006 const int filter_y_end = 2007 std::min(filter_height, input_height - in_y_origin); 2008 uint8 max = 0; 2009 for (int filter_y = filter_y_start; filter_y < filter_y_end; 2010 ++filter_y) { 2011 for (int filter_x = filter_x_start; filter_x < filter_x_end; 2012 ++filter_x) { 2013 const int in_x = in_x_origin + filter_x; 2014 const int in_y = in_y_origin + filter_y; 2015 max = std::max( 2016 max, 2017 input_data[Offset(input_dims, channel, in_x, in_y, batch)]); 2018 } 2019 } 2020 max = std::max<uint8>(max, output_activation_min); 2021 max = std::min<uint8>(max, output_activation_max); 2022 output_data[Offset(output_dims, channel, out_x, out_y, batch)] = 2023 static_cast<uint8>(max); 2024 } 2025 } 2026 } 2027 } 2028 } 2029 2030 // legacy, for compatibility with old checked-in code 2031 template <FusedActivationFunctionType Ac> 2032 void MaxPool(const uint8* input_data, const Dims<4>& input_dims, 2033 int stride_width, int stride_height, int pad_width, int pad_height, 2034 int filter_width, int filter_height, int32 output_activation_min, 2035 int32 output_activation_max, uint8* output_data, 2036 const Dims<4>& output_dims) { 2037 static_assert(Ac == FusedActivationFunctionType::kNone || 2038 Ac == FusedActivationFunctionType::kRelu || 2039 Ac == FusedActivationFunctionType::kRelu6 || 2040 Ac == FusedActivationFunctionType::kRelu1, 2041 ""); 2042 if (Ac == FusedActivationFunctionType::kNone) { 2043 TFLITE_DCHECK_EQ(output_activation_min, 0); 2044 TFLITE_DCHECK_EQ(output_activation_max, 255); 2045 } 2046 MaxPool(input_data, input_dims, stride_width, stride_height, pad_width, 2047 pad_height, filter_width, filter_height, output_activation_min, 2048 output_activation_max, output_data, output_dims); 2049 } 2050 2051 // legacy, for compatibility with old checked-in code 2052 template <FusedActivationFunctionType Ac> 2053 void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride, 2054 int pad_width, int pad_height, int filter_width, int filter_height, 2055 int32 output_activation_min, int32 output_activation_max, 2056 uint8* output_data, const Dims<4>& output_dims) { 2057 MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, 2058 filter_width, filter_height, output_activation_min, 2059 output_activation_max, output_data, output_dims); 2060 } 2061 2062 inline void LocalResponseNormalization(const float* input_data, 2063 const Dims<4>& input_dims, int range, 2064 float bias, float alpha, float beta, 2065 float* output_data, 2066 const Dims<4>& output_dims) { 2067 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2068 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 2069 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2070 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2071 2072 for (int b = 0; b < batches; ++b) { 2073 for (int y = 0; y < height; ++y) { 2074 for (int x = 0; x < width; ++x) { 2075 for (int c = 0; c < depth; ++c) { 2076 const int begin_input_c = std::max(0, c - range); 2077 const int end_input_c = std::min(depth, c + range); 2078 float accum = 0.f; 2079 for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { 2080 const float input_val = 2081 input_data[Offset(input_dims, input_c, x, y, b)]; 2082 accum += input_val * input_val; 2083 } 2084 const float 
multiplier = std::pow(bias + alpha * accum, -beta);
2085           output_data[Offset(output_dims, c, x, y, b)] =
2086               input_data[Offset(input_dims, c, x, y, b)] * multiplier;
2087         }
2088       }
2089     }
2090   }
2091 }
2092
2093 inline void Softmax(const float* input_data, const Dims<4>& input_dims,
2094                     float beta, float* output_data,
2095                     const Dims<4>& output_dims) {
2096   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
2097   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
2098   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
2099   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
2100
2101   for (int b = 0; b < batches; ++b) {
2102     for (int y = 0; y < height; ++y) {
2103       for (int x = 0; x < width; ++x) {
2104         // Find the max element value, which we'll use to ensure numerical
2105         // stability, taking advantage of the following equality:
2106         // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
2107         float max = std::numeric_limits<float>::lowest();
2108         for (int c = 0; c < depth; ++c) {
2109           max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
2110         }
2111
2112         // Compute sum.
2113         float sum = 0.f;
2114         for (int c = 0; c < depth; ++c) {
2115           sum += std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
2116                           beta);
2117         }
2118
2119         // Compute result.
2120         for (int c = 0; c < depth; ++c) {
2121           output_data[Offset(output_dims, c, x, y, b)] =
2122               std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
2123                        beta) /
2124               sum;
2125         }
2126       }
2127     }
2128   }
2129 }
2130
2131 inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
2132                     int32 input_beta_multiplier, int32 input_beta_left_shift,
2133                     int diff_min, uint8* output_data,
2134                     const Dims<4>& output_dims) {
2135   // The representation chosen for the input to the exp() function is Q5.26.
2136   // We need to leave extra space, since values that we skip may be as large
2137   // in magnitude as -32 before multiplying by input_beta_multiplier, and
2138   // therefore as large as -16 afterwards. Note that exp(-8) is definitely not
2139   // insignificant to accumulation, but exp(-16) definitely is negligible.
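  // (Q5.26 on int32 spans [-32, 32) with 26 fractional bits, so the 5 integer
  // bits provide exactly the headroom described above.)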
2140 static const int kScaledDiffIntegerBits = 5; 2141 static const int kAccumulationIntegerBits = 12; 2142 using FixedPointScaledDiff = 2143 gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>; 2144 using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>; 2145 using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; 2146 2147 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2148 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 2149 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2150 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2151 2152 for (int b = 0; b < batches; ++b) { 2153 for (int x = 0; x < width; ++x) { 2154 for (int y = 0; y < height; ++y) { 2155 uint8 max_in_row = 0; 2156 for (int c = 0; c < depth; ++c) { 2157 max_in_row = 2158 std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]); 2159 } 2160 2161 FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); 2162 for (int c = 0; c < depth; ++c) { 2163 int32 input_diff = 2164 static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) - 2165 max_in_row; 2166 if (input_diff >= diff_min) { 2167 const int32 input_diff_rescaled = 2168 MultiplyByQuantizedMultiplierGreaterThanOne( 2169 input_diff, input_beta_multiplier, input_beta_left_shift); 2170 const FixedPointScaledDiff scaled_diff_f8 = 2171 FixedPointScaledDiff::FromRaw(input_diff_rescaled); 2172 sum_of_exps = 2173 sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( 2174 exp_on_negative_values(scaled_diff_f8)); 2175 } 2176 } 2177 2178 int32 fixed_sum_of_exps = sum_of_exps.raw(); 2179 int headroom_plus_one = 2180 CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps)); 2181 // This is the number of bits to the left of the binary point above 1.0. 2182 // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and 2183 // no later adjustment will be needed. 
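        // Working through that example: 1.25 in Q12.19 has raw value
        // 1.25 * 2^19 = 655360, whose 32-bit representation has 12 leading
        // zeros, so headroom_plus_one = 12 and num_bits_over_unit = 0 below.
        // Shifting left by 12 and subtracting 2^31 leaves 2^29, i.e. 0.25 in
        // Q0.31, and 1 / (1 + 0.25) = 0.8.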
2184 int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; 2185 int32 shifted_sum_minus_one = static_cast<int32>( 2186 (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) - 2187 (static_cast<uint32>(1) << 31)); 2188 2189 FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1( 2190 FixedPoint0::FromRaw(shifted_sum_minus_one)); 2191 2192 for (int c = 0; c < depth; ++c) { 2193 int32 input_diff = 2194 static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) - 2195 max_in_row; 2196 if (input_diff >= diff_min) { 2197 const int32 input_diff_rescaled = 2198 MultiplyByQuantizedMultiplierGreaterThanOne( 2199 input_diff, input_beta_multiplier, input_beta_left_shift); 2200 const FixedPointScaledDiff scaled_diff_f8 = 2201 FixedPointScaledDiff::FromRaw(input_diff_rescaled); 2202 2203 FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); 2204 int32 unsat_output = gemmlowp::RoundingDivideByPOT( 2205 (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8); 2206 2207 output_data[Offset(output_dims, c, x, y, b)] = static_cast<uint8>( 2208 std::max(std::min(unsat_output, static_cast<int32>(255)), 0)); 2209 2210 } else { 2211 output_data[Offset(output_dims, c, x, y, b)] = 0; 2212 } 2213 } 2214 } 2215 } 2216 } 2217 } 2218 2219 inline void Logistic(const float* input_data, const Dims<4>& input_dims, 2220 float* output_data, const Dims<4>& output_dims) { 2221 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2222 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 2223 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2224 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2225 for (int b = 0; b < batches; ++b) { 2226 for (int y = 0; y < height; ++y) { 2227 for (int x = 0; x < width; ++x) { 2228 for (int c = 0; c < depth; ++c) { 2229 float val = input_data[Offset(input_dims, c, x, y, b)]; 2230 float result = 1.f / (1.f + std::exp(-val)); 2231 output_data[Offset(output_dims, c, x, y, b)] = result; 2232 } 2233 } 2234 } 2235 } 2236 } 2237 2238 inline void Logistic(const uint8* input_data, const Dims<4>& input_dims, 2239 int32 input_zero_point, int32 input_range_radius, 2240 int32 input_multiplier, int input_left_shift, 2241 uint8* output_data, const Dims<4>& output_dims) { 2242 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2243 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 2244 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2245 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2246 for (int b = 0; b < batches; ++b) { 2247 for (int y = 0; y < height; ++y) { 2248 for (int x = 0; x < width; ++x) { 2249 for (int c = 0; c < depth; ++c) { 2250 const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)]; 2251 const int32 input_val_centered = 2252 static_cast<int32>(input_val_u8) - input_zero_point; 2253 uint8 output_val; 2254 if (input_val_centered <= -input_range_radius) { 2255 output_val = 0; 2256 } else if (input_val_centered >= input_range_radius) { 2257 output_val = 255; 2258 } else { 2259 const int32 input_val_rescaled = 2260 MultiplyByQuantizedMultiplierGreaterThanOne( 2261 input_val_centered, input_multiplier, input_left_shift); 2262 using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>; 2263 using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; 2264 const FixedPoint4 input_val_f4 = 2265 FixedPoint4::FromRaw(input_val_rescaled); 2266 const FixedPoint0 output_val_f0 = 
gemmlowp::logistic(input_val_f4); 2267 using gemmlowp::RoundingDivideByPOT; 2268 int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23); 2269 if (output_val_s32 == 256) { 2270 output_val_s32 = 255; 2271 } 2272 TFLITE_DCHECK_GE(output_val_s32, 0); 2273 TFLITE_DCHECK_LE(output_val_s32, 255); 2274 output_val = static_cast<uint8>(output_val_s32); 2275 } 2276 output_data[Offset(output_dims, c, x, y, b)] = output_val; 2277 } 2278 } 2279 } 2280 } 2281 } 2282 2283 inline void Tanh(const float* input_data, const Dims<4>& input_dims, 2284 float* output_data, const Dims<4>& output_dims) { 2285 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2286 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 2287 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2288 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2289 for (int b = 0; b < batches; ++b) { 2290 for (int y = 0; y < height; ++y) { 2291 for (int x = 0; x < width; ++x) { 2292 for (int c = 0; c < depth; ++c) { 2293 float val = input_data[Offset(input_dims, c, x, y, b)]; 2294 float result = std::tanh(val); 2295 output_data[Offset(output_dims, c, x, y, b)] = result; 2296 } 2297 } 2298 } 2299 } 2300 } 2301 2302 inline void Tanh(const uint8* input_data, const Dims<4>& input_dims, 2303 int32 input_zero_point, int32 input_range_radius, 2304 int32 input_multiplier, int input_left_shift, 2305 uint8* output_data, const Dims<4>& output_dims) { 2306 const int32 output_zero_point = 128; 2307 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2308 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 2309 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2310 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2311 for (int b = 0; b < batches; ++b) { 2312 for (int y = 0; y < height; ++y) { 2313 for (int x = 0; x < width; ++x) { 2314 for (int c = 0; c < depth; ++c) { 2315 const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)]; 2316 const int32 input_val_centered = 2317 static_cast<int32>(input_val_u8) - input_zero_point; 2318 uint8 output_val; 2319 if (input_val_centered <= -input_range_radius) { 2320 output_val = 0; 2321 } else if (input_val_centered >= input_range_radius) { 2322 output_val = 255; 2323 } else { 2324 const int32 input_val_rescaled = 2325 MultiplyByQuantizedMultiplierGreaterThanOne( 2326 input_val_centered, input_multiplier, input_left_shift); 2327 using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>; 2328 using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; 2329 const FixedPoint4 input_val_f4 = 2330 FixedPoint4::FromRaw(input_val_rescaled); 2331 const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4); 2332 2333 using gemmlowp::RoundingDivideByPOT; 2334 int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24); 2335 output_val_s32 += output_zero_point; 2336 if (output_val_s32 == 256) { 2337 output_val_s32 = 255; 2338 } 2339 TFLITE_DCHECK_GE(output_val_s32, 0); 2340 TFLITE_DCHECK_LE(output_val_s32, 255); 2341 output_val = static_cast<uint8>(output_val_s32); 2342 } 2343 output_data[Offset(output_dims, c, x, y, b)] = output_val; 2344 } 2345 } 2346 } 2347 } 2348 } 2349 2350 inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims, 2351 int32 zero_point, double scale, float* output_data, 2352 const Dims<4>& output_dims) { 2353 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2354 const int height = MatchingArraySize(input_dims, 2, 
output_dims, 2); 2355 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2356 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2357 for (int b = 0; b < batches; ++b) { 2358 for (int y = 0; y < height; ++y) { 2359 for (int x = 0; x < width; ++x) { 2360 for (int c = 0; c < depth; ++c) { 2361 int32 val = input_data[Offset(input_dims, c, x, y, b)]; 2362 float result = static_cast<float>(scale * (val - zero_point)); 2363 output_data[Offset(output_dims, c, x, y, b)] = result; 2364 } 2365 } 2366 } 2367 } 2368 } 2369 2370 inline void FakeQuant(const float* input_data, const Dims<4>& input_dims, 2371 float rmin, float rmax, float* output_data, 2372 const Dims<4>& output_dims) { 2373 // 0 should always be a representable value. Let's assume that the initial 2374 // min,max range contains 0. 2375 TFLITE_DCHECK_LE(rmin, 0.); 2376 TFLITE_DCHECK_GE(rmax, 0.); 2377 2378 // Determine quantization parameters: zero_point, scale. 2379 using Integer = uint8; 2380 const Integer qmin = std::numeric_limits<Integer>::min(); 2381 const Integer qmax = std::numeric_limits<Integer>::max(); 2382 const float qmin_float = qmin; 2383 const float qmax_float = qmax; 2384 int32 zero_point = 0; 2385 float scale = 0.f; 2386 // If rmin==rmax, both must be zero per the above assertion, 2387 // so we are done. 2388 if (rmin != rmax) { 2389 // First determine the scale. 2390 scale = (rmax - rmin) / (qmax_float - qmin_float); 2391 2392 // Zero-point computation. 2393 // First the initial floating-point computation. The zero-point can be 2394 // determined from solving an affine equation for any known pair 2395 // (real value, corresponding quantized value). 2396 // We know two such pairs: (rmin, qmin) and (rmax, qmax). 2397 // The arithmetic error on the zero point computed from either pair 2398 // will be roughly machine_epsilon * (sum of absolute values of terms) 2399 // so we want to use the variant that adds the smaller terms. 2400 const float zero_point_from_min = qmin_float - rmin / scale; 2401 const float zero_point_from_max = qmax_float - rmax / scale; 2402 const float zero_point_from_min_error = 2403 std::abs(qmin_float) + std::abs(rmin / scale); 2404 const float zero_point_from_max_error = 2405 std::abs(qmax_float) + std::abs(rmax / scale); 2406 2407 const float zero_point_float = 2408 zero_point_from_min_error < zero_point_from_max_error 2409 ? zero_point_from_min 2410 : zero_point_from_max; 2411 2412 // Now we need to nudge the zero point to be an integer 2413 // (our zero points are integer, and this is motivated by the requirement 2414 // to be able to represent the real value "0" exactly as a quantized value, 2415 // which is required in multiple places, for example in Im2col with SAME 2416 // padding). 2417 if (zero_point_float < qmin_float) { 2418 zero_point = qmin; 2419 } else if (zero_point_float > qmax_float) { 2420 zero_point = qmax; 2421 } else { 2422 zero_point = static_cast<int32>(TfLiteRound(zero_point_float)); 2423 } 2424 // The zero point should always be in the range of quantized value, 2425 // [qmin, qmax]. 
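  // Worked example (illustrative values): rmin = -1.f, rmax = 2.f gives
  // scale = 3/255 ~= 0.01176. Then zero_point_from_min = 0 - (-1)/scale = 85
  // with error 0 + 85 = 85, while zero_point_from_max = 255 - 2/scale = 85
  // with error 255 + 170 = 425; the min-based variant is selected, and the
  // zero point nudges to exactly 85.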
2426 TFLITE_DCHECK_GE(zero_point, qmin); 2427 TFLITE_DCHECK_LE(zero_point, qmax); 2428 } 2429 2430 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2431 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 2432 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2433 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2434 for (int b = 0; b < batches; ++b) { 2435 for (int y = 0; y < height; ++y) { 2436 for (int x = 0; x < width; ++x) { 2437 for (int c = 0; c < depth; ++c) { 2438 const float src_val = input_data[Offset(input_dims, c, x, y, b)]; 2439 const float unclamped_quantized_val = 2440 TfLiteRound(zero_point + src_val / scale); 2441 const float quantized_val = std::min( 2442 qmax_float, std::max(qmin_float, unclamped_quantized_val)); 2443 const float dst_val = scale * (quantized_val - zero_point); 2444 output_data[Offset(output_dims, c, x, y, b)] = dst_val; 2445 } 2446 } 2447 } 2448 } 2449 } 2450 2451 template <typename SrcT, typename DstT> 2452 inline void Cast(const SrcT* input_data, const Dims<4>& input_dims, 2453 DstT* output_data, const Dims<4>& output_dims) { 2454 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2455 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 2456 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2457 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2458 for (int b = 0; b < batches; ++b) { 2459 for (int y = 0; y < height; ++y) { 2460 for (int x = 0; x < width; ++x) { 2461 for (int c = 0; c < depth; ++c) { 2462 int offset = Offset(input_dims, c, x, y, b); 2463 output_data[offset] = static_cast<DstT>(input_data[offset]); 2464 } 2465 } 2466 } 2467 } 2468 } 2469 2470 inline void Floor(const float* input_data, const Dims<4>& input_dims, 2471 float* output_data, const Dims<4>& output_dims) { 2472 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2473 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 2474 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2475 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2476 for (int b = 0; b < batches; ++b) { 2477 for (int y = 0; y < height; ++y) { 2478 for (int x = 0; x < width; ++x) { 2479 for (int c = 0; c < depth; ++c) { 2480 int offset = Offset(input_dims, c, x, y, b); 2481 output_data[offset] = std::floor(input_data[offset]); 2482 } 2483 } 2484 } 2485 } 2486 } 2487 2488 template <typename T> 2489 inline void Gather(const T* input_data, const Dims<4>& input_dims, 2490 int input_rank, const int32* coords_data, 2491 const Dims<4>& coords_dims, T* output_data, 2492 const Dims<4>& output_dims) { 2493 TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]); 2494 int stride = input_dims.strides[input_rank - 1]; 2495 T* out = output_data; 2496 2497 for (int i = 0; i < coords_dims.sizes[0]; i++) { 2498 TFLITE_DCHECK_GE(coords_data[i], 0); 2499 TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]); 2500 const T* in = input_data + coords_data[i] * stride; 2501 memcpy(out, in, sizeof(T) * stride); 2502 out += stride; 2503 } 2504 } 2505 2506 inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, 2507 const int32* output_size_data, 2508 const Dims<4>& output_size_dims, float* output_data, 2509 const Dims<4>& output_dims, bool align_corners) { 2510 int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2511 int32 input_height = ArraySize(input_dims, 
2); 2512 int32 input_width = ArraySize(input_dims, 1); 2513 int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2514 2515 TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1); 2516 TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1); 2517 TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1); 2518 TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2); 2519 int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)]; 2520 int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)]; 2521 float height_scale = static_cast<float>(input_height) / output_height; 2522 float width_scale = static_cast<float>(input_width) / output_width; 2523 if (align_corners && output_height > 1) { 2524 height_scale = static_cast<float>(input_height - 1) / (output_height - 1); 2525 } 2526 if (align_corners && output_width > 1) { 2527 width_scale = static_cast<float>(input_width - 1) / (output_width - 1); 2528 } 2529 2530 for (int b = 0; b < batches; ++b) { 2531 for (int y = 0; y < output_height; ++y) { 2532 float input_y = y * height_scale; 2533 int32 y0 = static_cast<int32>(std::floor(input_y)); 2534 int32 y1 = std::min(y0 + 1, input_height - 1); 2535 for (int x = 0; x < output_width; ++x) { 2536 float input_x = x * width_scale; 2537 int32 x0 = static_cast<int32>(std::floor(input_x)); 2538 int32 x1 = std::min(x0 + 1, input_width - 1); 2539 for (int c = 0; c < depth; ++c) { 2540 float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] * 2541 (1 - (input_y - y0)) * 2542 (1 - (input_x - x0)) + 2543 input_data[Offset(input_dims, c, x0, y1, b)] * 2544 (input_y - y0) * (1 - (input_x - x0)) + 2545 input_data[Offset(input_dims, c, x1, y0, b)] * 2546 (1 - (input_y - y0)) * (input_x - x0) + 2547 input_data[Offset(input_dims, c, x1, y1, b)] * 2548 (input_y - y0) * (input_x - x0); 2549 output_data[Offset(output_dims, c, x, y, b)] = interpolation; 2550 } 2551 } 2552 } 2553 } 2554 } 2555 2556 // legacy, for compatibility with old checked-in code 2557 inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, 2558 const int32* output_size_data, 2559 const Dims<4>& output_size_dims, float* output_data, 2560 const Dims<4>& output_dims) { 2561 ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims, 2562 output_data, output_dims, /*align_corners=*/false); 2563 } 2564 2565 template <typename T> 2566 inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims, 2567 const int32* block_shape_data, 2568 const Dims<4>& block_shape_dims, 2569 const int32* paddings_data, 2570 const Dims<4>& paddings_dims, T* output_data, 2571 const Dims<4>& output_dims) { 2572 const int output_batch_size = ArraySize(output_dims, 3); 2573 const int output_height = ArraySize(output_dims, 2); 2574 const int output_width = ArraySize(output_dims, 1); 2575 const int input_batch_size = ArraySize(input_dims, 3); 2576 const int input_height = ArraySize(input_dims, 2); 2577 const int input_width = ArraySize(input_dims, 1); 2578 const int depth = ArraySize(input_dims, 0); 2579 const int block_shape_height = block_shape_data[0]; 2580 const int block_shape_width = block_shape_data[1]; 2581 const int padding_top = paddings_data[0]; 2582 const int padding_left = paddings_data[2]; 2583 2584 for (int out_b = 0; out_b < output_batch_size; ++out_b) { 2585 int input_batch = out_b % input_batch_size; 2586 int shift_w = (out_b / input_batch_size) % block_shape_width; 2587 int shift_h = (out_b / input_batch_size) / block_shape_width; 2588 for (int out_h = 0; out_h < 
output_height; ++out_h) { 2589 for (int out_w = 0; out_w < output_width; ++out_w) { 2590 T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b); 2591 if (out_h * block_shape_height + shift_h < padding_top || 2592 out_h * block_shape_height + shift_h >= 2593 padding_top + input_height || 2594 out_w * block_shape_width + shift_w < padding_left || 2595 out_w * block_shape_width + shift_w >= padding_left + input_width) { 2596 memset(out, 0, depth * sizeof(T)); 2597 } else { 2598 const T* in = 2599 input_data + 2600 Offset(input_dims, 0, 2601 (out_w * block_shape_width + shift_w) - padding_left, 2602 (out_h * block_shape_height + shift_h) - padding_top, 2603 input_batch); 2604 memcpy(out, in, depth * sizeof(T)); 2605 } 2606 } 2607 } 2608 } 2609 } 2610 2611 template <typename T> 2612 inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims, 2613 const int32* block_shape_data, 2614 const Dims<4>& block_shape_dims, T* output_data, 2615 const Dims<4>& output_dims) { 2616 const int output_batch_size = ArraySize(output_dims, 3); 2617 const int input_batch_size = ArraySize(input_dims, 3); 2618 const int input_height = ArraySize(input_dims, 2); 2619 const int input_width = ArraySize(input_dims, 1); 2620 const int depth = ArraySize(input_dims, 0); 2621 const int block_shape_width = block_shape_data[1]; 2622 const int block_shape_height = block_shape_data[0]; 2623 2624 for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) { 2625 for (int in_h = 0; in_h < input_height; ++in_h) { 2626 for (int in_w = 0; in_w < input_width; ++in_w) { 2627 int out_batch = in_batch % output_batch_size; 2628 int out_w = in_w * block_shape_width + 2629 (in_batch / output_batch_size) % block_shape_width; 2630 int out_h = in_h * block_shape_height + 2631 (in_batch / output_batch_size) / block_shape_width; 2632 T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch); 2633 const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch); 2634 memcpy(out, in, depth * sizeof(T)); 2635 } 2636 } 2637 } 2638 } 2639 2640 template <typename T> 2641 inline void Pad(const T* input_data, const Dims<4>& input_dims, 2642 const std::vector<int>& left_paddings, 2643 const std::vector<int>& right_paddings, T* output_data, 2644 const Dims<4>& output_dims) { 2645 const int output_batch = ArraySize(output_dims, 3); 2646 const int output_height = ArraySize(output_dims, 2); 2647 const int output_width = ArraySize(output_dims, 1); 2648 const int output_depth = ArraySize(output_dims, 0); 2649 2650 const int left_b_padding = left_paddings[3]; 2651 const int left_h_padding = left_paddings[2]; 2652 const int left_w_padding = left_paddings[1]; 2653 const int left_d_padding = left_paddings[0]; 2654 2655 const int right_b_padding = right_paddings[3]; 2656 const int right_h_padding = right_paddings[2]; 2657 const int right_w_padding = right_paddings[1]; 2658 const int right_d_padding = right_paddings[0]; 2659 2660 const T* in_ptr = input_data; 2661 T* out_ptr = output_data; 2662 for (int out_b = 0; out_b < output_batch; ++out_b) { 2663 for (int out_h = 0; out_h < output_height; ++out_h) { 2664 for (int out_w = 0; out_w < output_width; ++out_w) { 2665 for (int out_d = 0; out_d < output_depth; ++out_d) { 2666 if (out_b < left_b_padding || 2667 out_b >= output_batch - right_b_padding || 2668 out_h < left_h_padding || 2669 out_h >= output_height - right_h_padding || 2670 out_w < left_w_padding || 2671 out_w >= output_width - right_w_padding || 2672 out_d < left_d_padding || 2673 out_d >= output_depth - 

inline bool LoopCondition(int index, int stop, int stride) {
  return stride > 0 ? index < stop : index > stop;
}

inline int StartIndex(int start, int stride, int dim, bool masked) {
  return masked ? (stride > 0 ? 0 : dim - 1) : start;
}

inline int StopIndex(int start, int stop, int stride, int dim, bool masked,
                     bool shrink_axis_masked) {
  return shrink_axis_masked ? (stride > 0 ? start + 1 : start - 1)
                            : (masked ? (stride > 0 ? dim : -1) : stop);
}

template <typename T>
inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                         int begin_mask, int end_mask, int shrink_axis_mask,
                         const std::vector<int>& starts,
                         const std::vector<int>& stops,
                         const std::vector<int>& strides, T* output_data,
                         const Dims<4>& output_dims) {
  TFLITE_DCHECK_EQ(starts.size(), 4);
  TFLITE_DCHECK_EQ(stops.size(), 4);
  TFLITE_DCHECK_EQ(strides.size(), 4);
  const int start_b =
      StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8);
  const int stop_b =
      StopIndex(start_b, stops[3], strides[3], input_dims.sizes[3],
                end_mask & 8, shrink_axis_mask & 8);
  const int start_h =
      StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4);
  const int stop_h =
      StopIndex(start_h, stops[2], strides[2], input_dims.sizes[2],
                end_mask & 4, shrink_axis_mask & 4);
  const int start_w =
      StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2);
  const int stop_w =
      StopIndex(start_w, stops[1], strides[1], input_dims.sizes[1],
                end_mask & 2, shrink_axis_mask & 2);
  const int start_d =
      StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1);
  const int stop_d =
      StopIndex(start_d, stops[0], strides[0], input_dims.sizes[0],
                end_mask & 1, shrink_axis_mask & 1);

  T* out_ptr = output_data;
  for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]);
       in_b += strides[3]) {
    for (int in_h = start_h; LoopCondition(in_h, stop_h, strides[2]);
         in_h += strides[2]) {
      for (int in_w = start_w; LoopCondition(in_w, stop_w, strides[1]);
           in_w += strides[1]) {
        for (int in_d = start_d; LoopCondition(in_d, stop_d, strides[0]);
             in_d += strides[0]) {
          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
        }
      }
    }
  }
}

template <typename T>
inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                         int begin_mask, int end_mask,
                         const std::vector<int>& starts,
                         const std::vector<int>& stops,
                         const std::vector<int>& strides, T* output_data,
                         const Dims<4>& output_dims) {
  StridedSlice(input_data, input_dims, begin_mask, end_mask,
               /*shrink_axis_mask=*/0, starts, stops, strides, output_data,
               output_dims);
}

template <typename T>
inline void Slice(const T* input_data, const Dims<4>& input_dims,
                  const std::vector<int>& begin, const std::vector<int>& size,
                  T* output_data, const Dims<4>& output_dims) {
  // TODO(dkalenichenko): This op only supports 4D tensors.
  TFLITE_DCHECK_EQ(begin.size(), 4);
  TFLITE_DCHECK_EQ(size.size(), 4);
  // A size of -1 means the slice extends to the end of that dimension.
  const int start_b = begin[3];
  const int stop_b =
      size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
  const int start_h = begin[2];
  const int stop_h =
      size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
  const int start_w = begin[1];
  const int stop_w =
      size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
  const int start_d = begin[0];
  const int stop_d =
      size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];

  T* out_ptr = output_data;
  for (int in_b = start_b; in_b < stop_b; ++in_b) {
    for (int in_h = start_h; in_h < stop_h; ++in_h) {
      for (int in_w = start_w; in_w < stop_w; ++in_w) {
        for (int in_d = start_d; in_d < stop_d; ++in_d) {
          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
        }
      }
    }
  }
}
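
// Worked example (illustrative only; the helper name is hypothetical): the
// StridedSlice mask bits address dimensions in reverse storage order, so
// bit 0 (value 1) is depth and bit 3 (value 8) is batch. With a positive
// stride, a set begin-mask bit forces the range to start at 0 no matter
// what start index was requested:
inline int ExampleMaskedBatchStart(int requested_start, int batch_extent) {
  // Equivalent to the start_b computation in StridedSlice when
  // (begin_mask & 8) != 0 and strides[3] == 1.
  return StartIndex(requested_start, /*stride=*/1, batch_extent,
                    /*masked=*/true);  // Always 0 for a positive stride.
}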

template <typename T>
inline void Exp(const T* input_data, const size_t num_elements,
                T* output_data) {
  for (size_t idx = 0; idx < num_elements; ++idx) {
    output_data[idx] = std::exp(input_data[idx]);
  }
}

template <typename T>
inline void Mean(T* input_data, const int* input_dims,
                 const int input_num_dims, T* output_data,
                 const int* output_dims, const int output_num_dims,
                 const int* axis, const int num_axis_dimensions, bool keep_dims,
                 int* temp_index, int* resolved_axis) {
  // Reset the output data.
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    num_outputs *= static_cast<size_t>(output_dims[idx]);
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = 0;
  }
  // Reset the temporary index.
  for (int idx = 0; idx < input_num_dims; ++idx) {
    temp_index[idx] = 0;
  }
  // Resolve the axis list: fold negative values into [0, input_num_dims)
  // and drop duplicates.
  int num_resolved_axis = 0;
  for (int idx = 0; idx < num_axis_dimensions; ++idx) {
    int current = axis[idx];
    TFLITE_DCHECK(current < input_num_dims && current + input_num_dims >= 0);
    if (current < 0) {
      current += input_num_dims;
    }
    bool is_dup = false;
    for (int j = 0; j < num_resolved_axis; ++j) {
      if (resolved_axis[j] == current) {
        is_dup = true;
        break;
      }
    }
    if (!is_dup) {
      resolved_axis[num_resolved_axis++] = current;
    }
  }
  // Iterate through input_data, accumulating each element into the output
  // cell it reduces to.
  for (bool has_next = true; has_next;
       has_next = NextIndex(input_num_dims, input_dims, temp_index)) {
    size_t input_offset =
        ReducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr);
    size_t output_offset =
        ReducedOutputOffset(input_num_dims, input_dims, temp_index,
                            num_resolved_axis, resolved_axis);
    output_data[output_offset] += input_data[input_offset];
  }
  // Divide each sum by the number of elements accumulated into it to get
  // the mean.
  size_t num_elements_in_axis = 1;
  for (int idx = 0; idx < num_resolved_axis; ++idx) {
    num_elements_in_axis *= static_cast<size_t>(input_dims[resolved_axis[idx]]);
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = static_cast<T>(static_cast<float>(output_data[idx]) /
                                      num_elements_in_axis);
  }
}
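
// Illustrative sketch (hypothetical helper, not part of the reference API):
// Mean() folds negative axis values exactly as below; e.g. for a 4-D input,
// axis -1 resolves to 3, the last dimension.
inline int ExampleResolveReductionAxis(int axis, int input_num_dims) {
  return axis < 0 ? axis + input_num_dims : axis;
}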

template <typename T>
inline void Mean(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& reduction_indices, T* output_data,
                 const Dims<4>& output_dims) {
  const int output_batch = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int output_depth = ArraySize(output_dims, 0);

  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);

  // The current implementation only supports simultaneous reduction over
  // width and height.
  TFLITE_DCHECK_EQ(reduction_indices.size(), 2);
  TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) ||
                (reduction_indices[0] == 2 && reduction_indices[1] == 1));
  TFLITE_DCHECK_EQ(output_height, 1);
  TFLITE_DCHECK_EQ(output_width, 1);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      float value = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)];
        }
      }
      output_data[Offset(output_dims, out_d, 0, 0, out_b)] =
          value / (input_width * input_height);
    }
  }
}

template <typename T>
void Sub(const T* input1_data, const Dims<4>& input1_dims,
         const T* input2_data, const Dims<4>& input2_dims, T* output_data,
         const Dims<4>& output_dims) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), and the
  // trailing dimension changes most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed: the
  // first dimension has the smallest stride.
  //
  // We name our variables by the TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride, for
  // the best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
        }
      }
    }
  }
}
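
// Worked example (illustrative only; this helper is hypothetical, not part
// of the reference API): subtracting a per-channel bias of shape
// (batches=1, height=1, width=1, depth=64) from a (1, 16, 16, 64) input.
// NdArrayDescsForElementwiseBroadcast leaves the input descriptor alone and
// rewrites the bias descriptor so its height and width extents become 16
// with strides of 0; every (x, y) subscript then maps back to the same
// 64-element slice:
inline int ExampleBroadcastBiasIndex(const NdArrayDesc<4>& bias_desc, int c,
                                     int x, int y) {
  // With zeroed height/width strides and a single batch, this reduces to
  // c * bias_desc.strides[0], i.e. just c for a contiguous bias.
  return SubscriptToIndex(bias_desc, c, x, y, /*i3=*/0);
}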

template <typename T>
void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
                       const T* input2_data, T* output_data,
                       const Dims<4>& output_dims) {
  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);

  // input2 is treated as a scalar: only its first element is read.
  auto min_value = input2_data[0];

  for (int b = 0; b < batches; b++) {
    for (int y = 0; y < input_height; y++) {
      for (int x = 0; x < input_width; x++) {
        for (int c = 0; c < depth; c++) {
          int offset = Offset(input1_dims, c, x, y, b);
          output_data[offset] =
              input1_data[offset] > min_value ? min_value : input1_data[offset];
        }
      }
    }
  }
}

template <typename T>
void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
                       const T* input2_data, T* output_data,
                       const Dims<4>& output_dims) {
  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);

  // input2 is treated as a scalar: only its first element is read.
  auto max_value = input2_data[0];

  for (int b = 0; b < batches; b++) {
    for (int y = 0; y < input_height; y++) {
      for (int x = 0; x < input_width; x++) {
        for (int c = 0; c < depth; c++) {
          int offset = Offset(input1_dims, c, x, y, b);
          output_data[offset] =
              input1_data[offset] < max_value ? max_value : input1_data[offset];
        }
      }
    }
  }
}

template <typename T1, typename T2, typename T3>
void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
            T2* output_data, const Dims<4>& output_dims) {
  // The current ArgMax implementation can only determine the index of the
  // maximum value in the last dimension, so the axis argument is ignored.
  TFLITE_DCHECK_EQ(axis[0], 3);

  // For ArgMax, the number of output dimensions is one less than the number
  // of input dimensions. For simplicity, the output here keeps the same
  // rank as the input, and we enforce that its last dimension is always 1.
  TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = ArraySize(input_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
        int max_index = 0;
        for (int d = 1; d < depth; ++d) {
          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
          if (curr_value > max_value) {
            max_value = curr_value;
            max_index = d;
          }
        }
        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
      }
    }
  }
}
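
// Worked example (illustrative only): for a single 1x1x1x4 input holding
// {0.1f, 0.9f, 0.3f, 0.9f}, ArgMax writes 1 to the output. Ties resolve to
// the first occurrence, because the scan above only replaces max_index on a
// strictly greater value.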

template <typename T>
void Transpose(const T* input, const Dims<4>& input_dims, T* output,
               const Dims<4>& output_dims, int* permuted_axes) {
  int out_sizes[4];
  // Record the output extents, checking along the way that output_dims
  // matches input_dims under the permutation.
  for (int k = 0; k < 4; k++) {
    out_sizes[k] =
        MatchingArraySize(input_dims, permuted_axes[k], output_dims, k);
  }

  // Naive "output centered" transpose loop: iterate on the output index and
  // compute the corresponding input index.
  int o[4];  // Loop index (on output).
  int i[4];
  for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) {
    i[permuted_axes[3]] = o[3];
    for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) {
      i[permuted_axes[2]] = o[2];
      for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) {
        i[permuted_axes[1]] = o[1];
        for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) {
          i[permuted_axes[0]] = o[0];
          output[Offset(output_dims, o)] = input[Offset(input_dims, i)];
        }
      }
    }
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_