/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_

#include "public/gemmlowp.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_ops {

// Implementation of float DepthwiseConv

template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct FloatDepthwiseConvKernel {};
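
// A summary of the Run() contract shared by the specializations below
// (parameter names follow the code):
//   - num_output_pixels: how many consecutive output pixels of the current
//     output row segment to accumulate into.
//   - input_depth, depth_multiplier: runtime values; when kFixedInputDepth or
//     kFixedDepthMultiplier is nonzero, the caller guarantees they match.
//     kFixedInputDepth == 0 means the kernel handles any input depth.
//   - input_ptr_increment: how far input_ptr advances from one output pixel
//     to the next. The kAllowStrided == false kernels ignore it: with
//     stride 1 and a fixed input depth, successive output pixels read
//     contiguous input values.
//   - acc_buffer_ptr: points into an accumulator buffer holding
//     output_depth == input_depth * depth_multiplier floats per output pixel,
//     pre-initialized with the bias values.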

#ifdef USE_NEON

template <>
struct FloatDepthwiseConvKernel<false, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      float32x4_t input[4];
      for (int i = 0; i < 4; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 16;
      // Load the accumulators from acc_buffer
      float32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
      acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
      acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
      acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 8;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    const float32x2_t filters = vld1_f32(filter_ptr);
    const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the inputs
      float32x4_t input[4];
      for (int i = 0; i < 4; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 16;
      // Load the accumulators from acc_buffer
      float32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 4; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 8;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      const float32x4_t input = vld1q_f32(input_ptr);
      input_ptr += 4;
      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filters_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float32x2_t input = vld1_f32(input_ptr);
      input_ptr += 2;
      // Load the accumulators from acc_buffer
      float32x2_t acc = vld1_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmla_f32(acc, input, filters);
      // Store the accumulators back to acc_buffer
      vst1_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 0, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16) {
        // Load the filters
        float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
        float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
        float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
        float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
        local_filter_ptr += 16;
        // Load the inputs
        float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
        float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
        float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
        float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
        local_input_ptr += 16;
        // Load the accumulators from acc_buffer
        float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
        float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
        float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
        float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
        // Multiply-accumulate
        acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
        acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
        acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
        acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
        // Store the accumulators back to acc_buffer
        vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
        vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
        vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
        vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
        acc_buffer_ptr += 16;
      }
      // Handle 4 input channels at a time.
      for (; ic <= input_depth - 4; ic += 4) {
        // Load the filters
        float32x4_t filter;
        filter = vld1q_f32(local_filter_ptr);
        local_filter_ptr += 4;
        // Load the inputs
        float32x4_t input;
        input = vld1q_f32(local_input_ptr);
        local_input_ptr += 4;
        // Load the accumulators from acc_buffer
        float32x4_t acc;
        acc = vld1q_f32(acc_buffer_ptr);
        // Multiply-accumulate
        acc = vmlaq_f32(acc, input, filter);
        // Store the accumulators back to acc_buffer
        vst1q_f32(acc_buffer_ptr, acc);
        acc_buffer_ptr += 4;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const float input_val = *local_input_ptr++;
        const float filter_val = *local_filter_ptr++;
        *acc_buffer_ptr++ += filter_val * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 0, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 2 input channels at a time.
      for (; ic <= input_depth - 2; ic += 2) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs
        const float32x2_t input = vld1_f32(local_input_ptr);
        local_input_ptr += 2;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
        acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
        acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
        acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        // Load the filters
        float32x4_t filter[2];
        for (int i = 0; i < 2; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 8;
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Load the accumulators from acc_buffer
        float32x4_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        for (int i = 0; i < 2; i++) {
          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

// Note: this implementation is very slow for input_depths < 8 (e.g. it is
// comparable to the reference implementation); see the specializations for
// input_depth = 3 below.
template <>
struct FloatDepthwiseConvKernel<true, 0, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
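      // With depth_multiplier == 2, each input channel contributes to two
      // consecutive outputs, so vzipq_f32(input, input) in the loop below
      // duplicates every input lane in place, lining the inputs up with the
      // interleaved filter/accumulator layout.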
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs
        float32x4x2_t input_dup2[2];
        for (int i = 0; i < 2; i++) {
          const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
          input_dup2[i] = vzipq_f32(input, input);
        }
        local_input_ptr += 8;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
        acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
        acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
        acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle 4 input channels at a time.
      for (; ic <= input_depth - 4; ic += 4) {
        // Load the filters
        float32x2_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1_f32(local_filter_ptr + 2 * i);
        }
        local_filter_ptr += 8;
        // Load the inputs
        const float32x4_t input = vld1q_f32(local_input_ptr);
        local_input_ptr += 4;
        // Load the accumulators from acc_buffer
        float32x2_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
        }
        // Multiply-accumulate
        acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
        acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
        acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
        acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle 2 input channels at a time.
      for (; ic <= input_depth - 2; ic += 2) {
        // Load the filters
        const float32x4_t filter = vld1q_f32(local_filter_ptr);
        local_filter_ptr += 4;
        // Load the inputs
        const float32x2_t input = vld1_f32(local_input_ptr);
        local_input_ptr += 2;
        // Load the accumulators from acc_buffer
        float32x2_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
        }
        // Multiply-accumulate
        acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
        acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
        }
        acc_buffer_ptr += 4;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Multiply-accumulate
        for (int i = 0; i < 2; i++) {
          acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
        }
        local_filter_ptr += 2;
        acc_buffer_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 3, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x2_t filter[3];
    for (int i = 0; i < 3; i++) {
      filter[i] = vld1_f32(filter_ptr + 2 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float32x2_t input01 = vld1_f32(input_ptr);
      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
      // Load the accumulators from acc_buffer
      float32x2_t acc[3];
      for (int i = 0; i < 3; i++) {
        acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
      }
      // Multiply-accumulate: for each input channel there are 2 outputs.
      acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
      acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
      acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 3; i++) {
        vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
      }
      acc_buffer_ptr += 6;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 3, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter[3];
    for (int i = 0; i < 3; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // NOTE: we only want 3 values, so we read them as two ops where
      // the second op just duplicates the lane.
      const float32x2_t input01 = vld1_f32(input_ptr);
      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
      // Load the accumulators from acc_buffer
      float32x4_t acc[3];
      for (int i = 0; i < 3; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate all outputs.
      acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
      acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
      acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 3; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 12;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 1, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 1, 32> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
    float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
    float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
    float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
      float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
      float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
      float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
      // Multiply-accumulate
      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
      acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
      acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
      acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
      vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
      vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
      vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
      acc_buffer_ptr += 32;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 1, 20> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
      // Multiply-accumulate
      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
      acc_buffer_ptr += 20;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 0, 16> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      for (int ic = 0; ic < input_depth; ic++) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        for (int i = 0; i < 4; i++) {
          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    float32x2_t filter = vld1_f32(filter_ptr);
    float32x4_t filter_x4 = vcombine_f32(filter, filter);
    int outp = 0;

    // Handle two output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      float32x2_t input_1 = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;
      float32x2_t input_2 = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;
      float32x4_t input = vcombine_f32(input_1, input_2);

      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);

      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filter_x4);

      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x2_t input = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;

      // Load the accumulators from acc_buffer
      float32x2_t acc = vld1_f32(acc_buffer_ptr);

      // Multiply-accumulate
      acc = vmla_f32(acc, input, filter);

      // Store the accumulators back to acc_buffer
      vst1_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    float32x4_t filter = vld1q_f32(filter_ptr);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x4_t input = vld1q_f32(input_ptr);
      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filter);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
      input_ptr += input_ptr_increment;
    }
  }
};
#endif

// Accumulates the effect of one row of the filter, on a segment of one row
// of the output, accessing the corresponding one row of the input.
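//
// A note on the out_x loop bounds computed below: the input x coordinate read
// for a given output is in_x = out_x * stride - pad_width + filter_x, and it
// must stay inside [0, input_width). So the first in-bounds out_x is
// ceil((pad_width - filter_x) / stride), computed in integer arithmetic as
// (pad_width - filter_x + stride - 1) / stride. For example, with
// pad_width = 1, filter_x = 0 and stride = 2, the loop starts at out_x = 1,
// since out_x = 0 would read the padding pixel at in_x = -1.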
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width,
                                const float* input_data, int pad_width,
                                int depth_multiplier, int filter_width,
                                const float* filter_data,
                                int out_x_buffer_start, int out_x_buffer_end,
                                int output_depth, float* acc_buffer) {
#ifdef GEMMLOWP_PROFILING
  gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
#endif
  // Sanity check parameters. This is important in particular to ensure
  // that we keep the number of template instantiations minimal, so we don't
  // increase binary size unnecessarily.
  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
  static_assert(kFixedInputDepth || kAllowStrided, "");
  TFLITE_DCHECK(stride == 1 || kAllowStrided);
  if (kFixedInputDepth) {
    TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
  }
  if (kFixedDepthMultiplier) {
    TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
  }
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  const int input_ptr_increment = stride * input_depth;
  const float* filter_base_ptr = filter_data;
  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
    // For the current (filter_x, filter_y) point in the filter,
    // compute the boundaries of the corresponding output row segment.
    int out_x_loop_start_unclamped = 0;
    int out_x_loop_end_unclamped = 0;
    if (kAllowStrided) {
      if (stride == 2) {
        out_x_loop_start_unclamped = (pad_width - filter_x + 1) / 2;
        out_x_loop_end_unclamped =
            (pad_width + input_width - filter_x + 1) / 2;
      } else if (stride == 4) {
        out_x_loop_start_unclamped = (pad_width - filter_x + 3) / 4;
        out_x_loop_end_unclamped =
            (pad_width + input_width - filter_x + 3) / 4;
      } else {
        out_x_loop_start_unclamped =
            (pad_width - filter_x + stride - 1) / stride;
        out_x_loop_end_unclamped =
            (pad_width + input_width - filter_x + stride - 1) / stride;
      }
    } else {
      out_x_loop_start_unclamped = pad_width - filter_x;
      out_x_loop_end_unclamped = pad_width + input_width - filter_x;
    }
    // The kernel will have to iterate on the segment of the
    // output row that starts at out_x_loop_start and ends at out_x_loop_end.
    const int out_x_loop_start =
        std::max(out_x_buffer_start, out_x_loop_start_unclamped);
    const int out_x_loop_end =
        std::min(out_x_buffer_end, out_x_loop_end_unclamped);

    float* acc_buffer_ptr =
        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
    const float* input_ptr = input_data + in_x_origin * input_depth;
    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
    FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
                             kFixedDepthMultiplier>::Run(num_output_pixels,
                                                         input_depth,
                                                         depth_multiplier,
                                                         input_ptr,
                                                         input_ptr_increment,
                                                         filter_base_ptr,
                                                         acc_buffer_ptr);
    filter_base_ptr += output_depth;
  }
}

// Generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
inline void FloatDepthwiseConvAccumRowGeneric(
    int stride, int input_depth, int input_width, const float* input_data,
    int pad_width, int depth_multiplier, int filter_width,
    const float* filter_data, int out_x_buffer_start, int out_x_buffer_end,
    int output_depth, float* acc_buffer) {
  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
#ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
  LOG(FATAL)
      << "\n\n"
      << "*****************************************************************\n"
      << "* This tfmini inference code was about to use the slow generic\n"
      << "* fallback implementation for a DepthwiseConv op, and we want you\n"
      << "* to be aware of that so that you will know why you get terrible\n"
      << "* performance.\n"
      << "*\n"
      << "* If you would like to carry on with the slow code, compile\n"
      << "* with this preprocessor token defined:\n"
      << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
      << "*\n"
      << "* The right thing to do, if you care about performance, is to add\n"
      << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
      << "* The relevant parameters defining your case are:\n"
      << "* stride = " << stride << "\n"
      << "* input_depth = " << input_depth << "\n"
      << "* depth_multiplier = " << depth_multiplier << "\n"
      << "*\n"
      << "* Please do not hesitate to contact benoitjacob@ with this\n"
      << "* information.\n"
      << "*****************************************************************\n";
#endif  // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
#endif  // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
  const float* filter_base_ptr = filter_data;
  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
    const int out_x_loop_start = std::max(
        out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
    const int out_x_loop_end =
        std::min(out_x_buffer_end,
                 (pad_width + input_width - filter_x + stride - 1) / stride);

    float* acc_buffer_ptr =
        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
    const float* input_ptr = input_data + in_x_origin * input_depth;
    const int input_ptr_increment = (stride - 1) * input_depth;
    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
      const float* filter_ptr = filter_base_ptr;
      for (int ic = 0; ic < input_depth; ++ic) {
        const float input_val = *input_ptr++;
        for (int m = 0; m < depth_multiplier; m++) {
          const float filter_val = *filter_ptr++;
          *acc_buffer_ptr++ += filter_val * input_val;
        }
      }
      input_ptr += input_ptr_increment;
    }
    filter_base_ptr += output_depth;
  }
}

// Initializes the accumulator buffer with bias values.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const float* bias_data,
                                       float* acc_buffer) {
  // TODO(benoitjacob): This might need optimized specializations
  // for small output_depth values, if that ever becomes an important
  // case (like it was for some quantized DepthwiseConv cases).
  for (int i = 0; i < num_output_pixels; i++) {
    memcpy(acc_buffer + i * output_depth, bias_data,
           sizeof(acc_buffer[0]) * output_depth);
  }
}

inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
                          const float* filter_data, const Dims<4>& filter_dims,
                          const float* bias_data, const Dims<4>& bias_dims,
                          int stride_width, int stride_height, int pad_width,
                          int pad_height, int depth_multiplier,
                          float output_activation_min,
                          float output_activation_max, float* output_data,
                          const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int input_depth = ArraySize(input_dims, 0);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);

  static const int kAccBufferMaxSize = 2048;
  float acc_buffer[kAccBufferMaxSize];
  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
                   kAccBufferActualSize);
  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConv op.
  using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
                                        FIXED_DEPTH_MULTIPLIER)           \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
    row_accum_func =                                                      \
        FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,      \
                                   FIXED_DEPTH_MULTIPLIER>;               \
  }
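
// As an illustration, TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) expands
// (whitespace aside) to:
//
//   if (!row_accum_func && (stride_width == 1 || false) &&
//       (input_depth == 8 || 8 == 0) && depth_multiplier == 1) {
//     row_accum_func = FloatDepthwiseConvAccumRow<false, 8, 1>;
//   }
//
// Because of the !row_accum_func guard, the first matching kernel in the
// list below wins.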

#ifdef USE_NEON
  // We go over our list of kernels by decreasing order of preference
  // for the cases where multiple kernels could apply.

  // Start with the fastest kernels: AllowStrided=false, fixed input depth.

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)

  // Next come the strided kernels: AllowStrided=true, fixed input depth.
  // They are a bit less efficient, but allow stride!=1.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)

  // Finally, the kernels allowing a variable input depth;
  // these are the least efficient but most general kernels.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)

#endif  // USE_NEON

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  // No matching fast kernel found; use the slow fallback.
  if (!row_accum_func) {
    row_accum_func = FloatDepthwiseConvAccumRowGeneric;
  }

  // Now that we have determined row_accum_func, we can start work.
  float* output_ptr = output_data;
  for (int b = 0; b < batches; ++b) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      const int filter_y_start = std::max(0, -in_y_origin);
      const int filter_y_end =
          std::min(filter_height, input_height - in_y_origin);
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer) {
        const int out_x_buffer_end = std::min(
            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activations that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        // Initialize our local accumulator with the bias values, so we don't
        // have to add them later.
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
                                   acc_buffer);
        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end;
             ++filter_y) {
          const int in_y = in_y_origin + filter_y;
          row_accum_func(stride_width, input_depth, input_width,
                         input_data + in_y * input_dims.strides[2] +
                             b * input_dims.strides[3],
                         pad_width, depth_multiplier, filter_width,
                         filter_data + filter_y * filter_dims.strides[2],
                         out_x_buffer_start, out_x_buffer_end, output_depth,
                         acc_buffer);
        }
        // Finished accumulating. Now store to destination.
        const int num_output_values = output_depth * num_output_pixels;
        int i = 0;
        // TODO(benoitjacob) optimized code goes here
#ifdef USE_NEON
        // Handle 16 values at a time
        for (; i <= num_output_values - 16; i += 16) {
          float32x4_t acc[4];
          for (int k = 0; k < 4; k++) {
            acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
          }
          for (int k = 0; k < 4; k++) {
            acc[k] = vmaxq_f32(
                vdupq_n_f32(output_activation_min),
                vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
          }
          for (int k = 0; k < 4; k++) {
            vst1q_f32(output_ptr + 4 * k, acc[k]);
          }
          output_ptr += 16;
        }
        // Handle 4 values at a time
        for (; i <= num_output_values - 4; i += 4) {
          float32x4_t acc = vld1q_f32(acc_buffer + i);

          acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
                          vminq_f32(vdupq_n_f32(output_activation_max), acc));

          vst1q_f32(output_ptr, acc);
          output_ptr += 4;
        }
#endif
        // Handle leftover values, one by one. This is very slow.
        for (; i < num_output_values; i++) {
          float acc = acc_buffer[i];
          acc = std::max(output_activation_min,
                         std::min(output_activation_max, acc));
          *output_ptr++ = acc;
        }
      }
    }
  }
}
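
// A minimal usage sketch, kept as a comment; the stride, padding and
// ReLU6-style clamp bounds here are illustrative choices, not requirements.
// Each Dims<4> is populated with (depth, width, height, batch) sizes and
// matching strides, consistent with the ArraySize()/strides[] accesses above.
//
//   DepthwiseConv(input_data, input_dims, filter_data, filter_dims,
//                 bias_data, bias_dims, /*stride_width=*/1,
//                 /*stride_height=*/1, /*pad_width=*/1, /*pad_height=*/1,
//                 /*depth_multiplier=*/1, /*output_activation_min=*/0.0f,
//                 /*output_activation_max=*/6.0f, output_data, output_dims);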

// Legacy, for compatibility with old checked-in code.
template <FusedActivationFunctionType Ac>
void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
                   const float* filter_data, const Dims<4>& filter_dims,
                   const float* bias_data, const Dims<4>& bias_dims,
                   int stride_width, int stride_height, int pad_width,
                   int pad_height, int depth_multiplier, float* output_data,
                   const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
                bias_dims, stride_width, stride_height, pad_width, pad_height,
                depth_multiplier, output_activation_min, output_activation_max,
                output_data, output_dims);
}

// Legacy, for compatibility with old checked-in code.
template <FusedActivationFunctionType Ac>
void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
                   const float* filter_data, const Dims<4>& filter_dims,
                   const float* bias_data, const Dims<4>& bias_dims, int stride,
                   int pad_width, int pad_height, int depth_multiplier,
                   float* output_data, const Dims<4>& output_dims) {
  DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims,
                    bias_data, bias_dims, stride, stride, pad_width,
                    pad_height, depth_multiplier, output_data, output_dims);
}

}  // namespace optimized_ops
}  // namespace tflite

#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_