/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_

#include "fixedpoint/fixedpoint.h"
#include "public/gemmlowp.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_ops {

// Implementation of quantized DepthwiseConv

template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct QuantizedDepthwiseConvKernel {};
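
// Each specialization below provides a Run() method that accumulates the
// contribution of one filter position into `num_output_pixels` consecutive
// output pixels. In scalar terms it is roughly equivalent to:
//
//   for (int p = 0; p < num_output_pixels; p++) {
//     for (int ic = 0; ic < input_depth; ic++) {
//       for (int m = 0; m < depth_multiplier; m++) {
//         acc_buffer_ptr[p * input_depth * depth_multiplier +
//                        ic * depth_multiplier + m] +=
//             (input_ptr[p * input_ptr_increment + ic] + input_offset) *
//             (filter_ptr[ic * depth_multiplier + m] + filter_offset);
//       }
//     }
//   }
//
// A kFixedInputDepth of 0 means the input depth is only known at runtime.
// kAllowStrided kernels step between output pixels by input_ptr_increment,
// while non-strided kernels assume consecutive pixels are contiguous.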

#ifdef USE_NEON
template <>
struct QuantizedDepthwiseConvKernel<true, 8, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8x2_t filter_u8;
    filter_u8.val[0] = vld1_u8(filter_ptr);
    filter_u8.val[1] = vld1_u8(filter_ptr + 8);
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
                            vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4x2_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
                                  vget_low_s16(input_dup2.val[i]));
        acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
                                  vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
      acc[1] =
          vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
      acc[3] =
          vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      acc[0] = vld1q_s32(acc_buffer_ptr);
      acc[1] = vld1q_s32(acc_buffer_ptr + 4);

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc[0]);
      vst1q_s32(acc_buffer_ptr + 4, acc[1]);
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
                                   vget_low_s16(input_dup2.val[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
                                   vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4x2_t input_dup2 = vzip_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }
    int outp = 0;
    // Handle two output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);

      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x2_t acc = vld1_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32 input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32 input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vmlal_n_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      }
      input_ptr += 16;
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
        acc[2 * i + 1] =
            vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc;
      acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
                              vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
                              vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
                              vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
                              vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
                              vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
                              vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
                              vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
                              vget_high_s16(input), 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 3> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // We will have to duplicate bytes in a NEON register, 3-fold.
    // We will do that by register-level table-look-up using VTBL instructions.
    // Here we prepare the registers containing the table-lookup indices.
    static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
                                                   {2, 3, 3, 3, 4, 4, 4, 5},
                                                   {5, 5, 6, 6, 6, 7, 7, 7}};
    uint8x8_t dup3_indices[3];
    for (int i = 0; i < 3; i++) {
      dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
    }
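    // For example, applying these three index vectors to input bytes
    // {i0, i1, ..., i7} with VTBL yields the 3-fold duplicated sequence
    // {i0, i0, i0, i1, i1, i1, i2, i2}, {i2, i3, i3, i3, i4, i4, i4, i5},
    // {i5, i5, i6, i6, i6, i7, i7, i7}, i.e. one copy of each input value per
    // depth_multiplier = 3 output channel.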

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8* local_filter_ptr = filter_ptr;
      const uint8* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        int16x8_t filter[3];
        uint8x8x3_t filter_u8;
        filter_u8.val[0] = vld1_u8(local_filter_ptr);
        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
        filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
        local_filter_ptr += 24;
        for (int i = 0; i < 3; i++) {
          const int16x8_t filter_s16 =
              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        }
        // Load the inputs, duplicate 3-fold, add input_offset.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;

        uint8x8_t input_u8_dup3[3];
        for (int i = 0; i < 3; i++) {
          input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
        }
        int16x8_t input_dup3[3];
        for (int i = 0; i < 3; i++) {
          const int16x8_t input_s16_dup3 =
              vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
          input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
        }
        // Load the accumulators from acc_buffer
        int32x4x3_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
          acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
        }
        // Multiply-accumulate
        for (int j = 0; j < 3; j++) {
          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
                                    vget_low_s16(filter[j]));
          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
                                    vget_high_s16(filter[j]));
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
        }
        acc_buffer_ptr += 24;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const int16 input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 3; i++) {
          const int16 filter_val = local_filter_ptr[i] + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
        }
        local_filter_ptr += 3;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8* local_filter_ptr = filter_ptr;
      const uint8* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        int16x8_t filter[2];
        uint8x8x2_t filter_u8;
        filter_u8.val[0] = vld1_u8(local_filter_ptr);
        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
        local_filter_ptr += 16;
        for (int i = 0; i < 2; i++) {
          const int16x8_t filter_s16 =
              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        }
        // Load the inputs, add input_offset, duplicate 2-fold.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        const int16x8_t input =
            vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        const int16x8x2_t input_dup2 = vzipq_s16(input, input);
        // Load the accumulators from acc_buffer.
        int32x4x2_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
        }
        // Multiply-accumulate.
        for (int j = 0; j < 2; j++) {
          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
                                    vget_low_s16(input_dup2.val[j]));
          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
                                    vget_high_s16(input_dup2.val[j]));
        }
        // Store the accumulators back to acc_buffer.
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        // Load the inputs.
        const int16 input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 2; i++) {
          const int16 filter_val = local_filter_ptr[i] + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
        }
        local_filter_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8* local_filter_ptr = filter_ptr;
      const uint8* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16) {
        // Load the filters, add filter_offset.
        uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
        uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
        local_filter_ptr += 16;
        int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
        int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
        filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
        filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
        // Load the inputs, add input_offset.
        uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
        uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
        local_input_ptr += 16;
        int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
        int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
        input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
        input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
        int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
        int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
        int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
        acc_0 =
            vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
        acc_1 =
            vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
        acc_2 =
            vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
        acc_3 =
            vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
        // Store the accumulators back to acc_buffer
        vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
        vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
        vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
        vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
        acc_buffer_ptr += 16;
      }
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
        local_filter_ptr += 8;
        const int16x8_t filter_s16 =
            vreinterpretq_s16_u16(vmovl_u8(filter_u8));
        const int16x8_t filter =
            vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        // Load the inputs, add input_offset.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        const int16x8_t input =
            vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
        acc[1] =
            vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const int16 input_val = *local_input_ptr++ + input_offset;
        const int16 filter_val = *local_filter_ptr++ + filter_offset;
        *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 16, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8[2];
    for (int i = 0; i < 2; i++) {
      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    }
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += input_ptr_increment;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
                                   vget_low_s16(filter[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
                                   vget_high_s16(filter[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 16> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8[2];
    for (int i = 0; i < 2; i++) {
      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    }
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8 input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16 input = static_cast<int16>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
        acc[2 * i + 1] =
            vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 32> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
    uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
    uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
    int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
    int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
    filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
    filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8 input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16 input = static_cast<int16>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
      int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
      int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
      int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
      // Multiply-accumulate
      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
      acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
      acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
      acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
      acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
      vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
      vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
      vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
      acc_buffer_ptr += 32;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 20> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
    // We load the first 16 bytes into filter_u8_{0,1} as usual.
    // Then we load the 8 last bytes into filter_u8_x (x for 'extra').
    // This is redundant: the first 4 bytes of filter_u8_x are the same
    // as the last 4 bytes of filter_u8_1; only the high half of filter_x
    // is actually used below.
    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
    uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
    int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
    filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8 input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16 input = static_cast<int16>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
      // Multiply-accumulate
      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
      acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
      acc_buffer_ptr += 20;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter = vaddq_s16(
        vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8 input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16 input = static_cast<int16>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;

    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint16x4_t input_u16 = vdup_n_u16(0);
      input_u16 = vset_lane_u16(
          (reinterpret_cast<const uint16*>(input_ptr))[0], input_u16, 0);
      input_ptr += input_ptr_increment;
      input_u16 = vset_lane_u16(
          (reinterpret_cast<const uint16*>(input_ptr))[0], input_u16, 1);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 = vreinterpret_s16_u16(
          vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }

    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    if (num_output_pixels <= 0) {
      return;
    }

    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;

    // Handle one output pixel at a time until second to the last pixel. Second
    // to the last because we read eight input values while only processing
    // four.
    for (; outp < num_output_pixels - 1; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc;
      acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }

    // Handle the last output pixel.
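    // For the final pixel, load only the 4 needed bytes lane-by-lane rather
    // than with vld1_u8, so that we do not read past the end of the input
    // buffer.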
1396 // Load the accumulators from acc_buffer 1397 int32x4_t acc; 1398 acc = vld1q_s32(acc_buffer_ptr); 1399 1400 // Load the inputs, add input_offset. 1401 uint8x8_t input_u8 = vdup_n_u8(0); 1402 input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); 1403 input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); 1404 input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); 1405 input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); 1406 const int16x4_t input_s16 = 1407 vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 1408 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 1409 // Multiply-accumulate 1410 acc = vmlal_s16(acc, filter, input); 1411 // Store the accumulators back to acc_buffer 1412 vst1q_s32(acc_buffer_ptr, acc); 1413 } 1414 }; 1415 1416 template <> 1417 struct QuantizedDepthwiseConvKernel<false, 12, 1> { 1418 static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 1419 const uint8* input_ptr, int16 input_offset, 1420 int input_ptr_increment, const uint8* filter_ptr, 1421 int16 filter_offset, int32* acc_buffer_ptr) { 1422 // Load the filters, add filter_offset. 1423 uint8x8_t filter_u8_0 = vld1_u8(filter_ptr); 1424 uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4); 1425 int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0)); 1426 int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1)); 1427 filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset)); 1428 filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset)); 1429 int16x4_t filter_0 = vget_low_s16(filter_s16_0); 1430 int16x4_t filter_1 = vget_high_s16(filter_s16_0); 1431 int16x4_t filter_2 = vget_high_s16(filter_s16_1); 1432 1433 // Handle one output pixel at a time. 1434 for (int outp = 0; outp < num_output_pixels; outp++) { 1435 // Load the inputs, add input_offset. 1436 uint8x8_t input_u8_0 = vld1_u8(input_ptr); 1437 uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4); 1438 input_ptr += input_ptr_increment; 1439 int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0)); 1440 int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1)); 1441 input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); 1442 input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); 1443 1444 // Load the accumulators from acc_buffer 1445 int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); 1446 int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); 1447 int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); 1448 1449 // Multiply-accumulate 1450 acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0); 1451 acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1); 1452 acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2); 1453 1454 // Store the accumulators back to acc_buffer 1455 vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); 1456 vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); 1457 vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); 1458 1459 acc_buffer_ptr += 12; 1460 } 1461 } 1462 }; 1463 #endif 1464 1465 // Accumulates the effect of one row of the filter, on a segment of one row 1466 // of the output, accessing the corresponding one row of the input. 
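// acc_buffer holds output_depth int32 accumulators per output pixel, stored
// pixel-major. For every output x in [out_x_buffer_start, out_x_buffer_end)
// reached by the current filter_x, the selected kernel adds
//   (input[x * stride - pad_width + filter_x][ic] + input_offset) *
//   (filter[filter_x][ic * depth_multiplier + m] + filter_offset)
// to the accumulator of output channel ic * depth_multiplier + m.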
1467 template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> 1468 void QuantizedDepthwiseConvAccumRow( 1469 int stride, int input_depth, int input_width, const uint8* input_data, 1470 int16 input_offset, int pad_width, int depth_multiplier, int filter_width, 1471 const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, 1472 int out_x_buffer_end, int output_depth, int32* acc_buffer) { 1473 #ifdef GEMMLOWP_PROFILING 1474 gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__); 1475 #endif 1476 // Sanity check parameters. This is important in particular to ensure 1477 // that we keep the number of template instantiations minimal, so we don't 1478 // increase binary size unnecessarily. 1479 static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); 1480 static_assert(kFixedInputDepth || kAllowStrided, ""); 1481 TFLITE_DCHECK(stride == 1 || kAllowStrided); 1482 if (kFixedInputDepth) { 1483 TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth); 1484 } 1485 if (kFixedDepthMultiplier) { 1486 TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier); 1487 } 1488 TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); 1489 const int input_ptr_increment = stride * input_depth; 1490 const uint8* filter_base_ptr = filter_data; 1491 for (int filter_x = 0; filter_x < filter_width; ++filter_x) { 1492 // For the current (filter_x, filter_y) point in the filter, 1493 // compute the boundaries of the corresponding output row segment. 1494 int out_x_loop_start_unclampled = 0; 1495 int out_x_loop_end_unclampled = 0; 1496 if (kAllowStrided) { 1497 if (stride == 2) { 1498 out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2; 1499 out_x_loop_end_unclampled = 1500 (pad_width + input_width - filter_x + 1) / 2; 1501 } else if (stride == 4) { 1502 out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4; 1503 out_x_loop_end_unclampled = 1504 (pad_width + input_width - filter_x + 3) / 4; 1505 } else { 1506 out_x_loop_start_unclampled = 1507 (pad_width - filter_x + stride - 1) / stride; 1508 out_x_loop_end_unclampled = 1509 (pad_width + input_width - filter_x + stride - 1) / stride; 1510 } 1511 } else { 1512 out_x_loop_start_unclampled = pad_width - filter_x; 1513 out_x_loop_end_unclampled = pad_width + input_width - filter_x; 1514 } 1515 // The kernel will have to iterate on the segment of the 1516 // output row that starts at out_x_loop_start and out_x_loop_end. 1517 const int out_x_loop_start = 1518 std::max(out_x_buffer_start, out_x_loop_start_unclampled); 1519 const int out_x_loop_end = 1520 std::min(out_x_buffer_end, out_x_loop_end_unclampled); 1521 1522 int32* acc_buffer_ptr = 1523 acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; 1524 const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; 1525 const uint8* input_ptr = input_data + in_x_origin * input_depth; 1526 const int num_output_pixels = out_x_loop_end - out_x_loop_start; 1527 QuantizedDepthwiseConvKernel< 1528 kAllowStrided, kFixedInputDepth, 1529 kFixedDepthMultiplier>::Run(num_output_pixels, input_depth, 1530 depth_multiplier, input_ptr, input_offset, 1531 input_ptr_increment, filter_base_ptr, 1532 filter_offset, acc_buffer_ptr); 1533 filter_base_ptr += output_depth; 1534 } 1535 } 1536 1537 // generic fallback of DepthwiseConvAccumRow, portable, non-templatized. 
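// Unlike the specialized kernels above, it accepts any combination of stride,
// input_depth and depth_multiplier at the cost of scalar inner loops, and it
// is the path taken whenever none of the fast kernels registered below in
// DepthwiseConv matches the parameters of the op.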
1538 inline void QuantizedDepthwiseConvAccumRowGeneric( 1539 int stride, int input_depth, int input_width, const uint8* input_data, 1540 int16 input_offset, int pad_width, int depth_multiplier, int filter_width, 1541 const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, 1542 int out_x_buffer_end, int output_depth, int32* acc_buffer) { 1543 gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)"); 1544 #ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK 1545 #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK 1546 LOG(FATAL) 1547 << "\n\n" 1548 << "*****************************************************************\n" 1549 << "* This tfmini inference code was about to use the slow generic\n" 1550 << "* fallback implementation for a DepthwiseConv op, and we want you\n" 1551 << "* to be aware of that so that you will know why you get terrible\n" 1552 << "* performance.\n" 1553 << "*\n" 1554 << "* If you would like to carry on with the slow code, compile\n" 1555 << "* with this preprocessor token defined:\n" 1556 << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n" 1557 << "*\n" 1558 << "* The right thing to do, if you care about performance, is to add\n" 1559 << "* a new DepthwiseConv kernel to tfmini to cover your case.\n" 1560 << "* The relevant parameters defining your case are:\n" 1561 << "* stride = " << stride << "\n" 1562 << "* input_depth = " << input_depth << "\n" 1563 << "* depth_multiplier = " << depth_multiplier << "\n" 1564 << "*\n" 1565 << "* Please do not hesitate to contact benoitjacob@ with this\n" 1566 << "* information.\n" 1567 << "*****************************************************************\n"; 1568 #endif // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK 1569 #endif // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK 1570 const uint8* filter_base_ptr = filter_data; 1571 for (int filter_x = 0; filter_x < filter_width; ++filter_x) { 1572 const int out_x_loop_start = std::max( 1573 out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride); 1574 const int out_x_loop_end = 1575 std::min(out_x_buffer_end, 1576 (pad_width + input_width - filter_x + stride - 1) / stride); 1577 1578 int32* acc_buffer_ptr = 1579 acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; 1580 const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; 1581 const uint8* input_ptr = input_data + in_x_origin * input_depth; 1582 const int input_ptr_increment = (stride - 1) * input_depth; 1583 for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) { 1584 const uint8* filter_ptr = filter_base_ptr; 1585 for (int ic = 0; ic < input_depth; ++ic) { 1586 const int16 input_val = *input_ptr++ + input_offset; 1587 for (int m = 0; m < depth_multiplier; m++) { 1588 const int16 filter_val = *filter_ptr++ + filter_offset; 1589 *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; 1590 } 1591 } 1592 input_ptr += input_ptr_increment; 1593 } 1594 filter_base_ptr += output_depth; 1595 } 1596 } 1597 1598 // Initializes the accumulator buffer with bias values. 
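// The buffer is laid out as num_output_pixels consecutive groups of
// output_depth int32 values, each group seeded with
// bias_data[0..output_depth). Depths 1, 2, 4, 8 and 16 get dedicated NEON
// store loops when USE_NEON is defined; any other depth falls through to the
// per-pixel memcpy at the end.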
1599 inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, 1600 const int32* bias_data, 1601 int32* acc_buffer) { 1602 int i = 0; 1603 #ifdef USE_NEON 1604 if (output_depth == 1) { 1605 const int32x4_t b = vdupq_n_s32(bias_data[0]); 1606 for (; i <= num_output_pixels - 16; i += 16) { 1607 vst1q_s32(acc_buffer + i + 0, b); 1608 vst1q_s32(acc_buffer + i + 4, b); 1609 vst1q_s32(acc_buffer + i + 8, b); 1610 vst1q_s32(acc_buffer + i + 12, b); 1611 } 1612 for (; i <= num_output_pixels - 4; i += 4) { 1613 vst1q_s32(acc_buffer + i, b); 1614 } 1615 } else if (output_depth == 2) { 1616 int32x4_t b = vdupq_n_s32(bias_data[0]); 1617 b = vsetq_lane_s32(bias_data[1], b, 1); 1618 b = vsetq_lane_s32(bias_data[1], b, 3); 1619 for (; i <= num_output_pixels - 8; i += 8) { 1620 vst1q_s32(acc_buffer + 2 * i + 0, b); 1621 vst1q_s32(acc_buffer + 2 * i + 4, b); 1622 vst1q_s32(acc_buffer + 2 * i + 8, b); 1623 vst1q_s32(acc_buffer + 2 * i + 12, b); 1624 } 1625 for (; i <= num_output_pixels - 2; i += 2) { 1626 vst1q_s32(acc_buffer + 2 * i, b); 1627 } 1628 } else if (output_depth == 4) { 1629 const int32x4_t b = vld1q_s32(bias_data); 1630 for (; i <= num_output_pixels - 4; i += 4) { 1631 vst1q_s32(acc_buffer + 4 * i + 0, b); 1632 vst1q_s32(acc_buffer + 4 * i + 4, b); 1633 vst1q_s32(acc_buffer + 4 * i + 8, b); 1634 vst1q_s32(acc_buffer + 4 * i + 12, b); 1635 } 1636 for (; i < num_output_pixels; i++) { 1637 vst1q_s32(acc_buffer + 4 * i, b); 1638 } 1639 } else if (output_depth == 8) { 1640 const int32x4_t b0 = vld1q_s32(bias_data); 1641 const int32x4_t b1 = vld1q_s32(bias_data + 4); 1642 for (; i <= num_output_pixels - 2; i += 2) { 1643 vst1q_s32(acc_buffer + 8 * i + 0, b0); 1644 vst1q_s32(acc_buffer + 8 * i + 4, b1); 1645 vst1q_s32(acc_buffer + 8 * i + 8, b0); 1646 vst1q_s32(acc_buffer + 8 * i + 12, b1); 1647 } 1648 for (; i < num_output_pixels; i++) { 1649 vst1q_s32(acc_buffer + 8 * i + 0, b0); 1650 vst1q_s32(acc_buffer + 8 * i + 4, b1); 1651 } 1652 } else if (output_depth == 16) { 1653 const int32x4_t b0 = vld1q_s32(bias_data); 1654 const int32x4_t b1 = vld1q_s32(bias_data + 4); 1655 const int32x4_t b2 = vld1q_s32(bias_data + 8); 1656 const int32x4_t b3 = vld1q_s32(bias_data + 12); 1657 for (; i < num_output_pixels; i++) { 1658 vst1q_s32(acc_buffer + 16 * i + 0, b0); 1659 vst1q_s32(acc_buffer + 16 * i + 4, b1); 1660 vst1q_s32(acc_buffer + 16 * i + 8, b2); 1661 vst1q_s32(acc_buffer + 16 * i + 12, b3); 1662 } 1663 } 1664 #endif 1665 for (; i < num_output_pixels; i++) { 1666 memcpy(acc_buffer + i * output_depth, bias_data, 1667 sizeof(acc_buffer[0]) * output_depth); 1668 } 1669 } 1670 1671 inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, 1672 int32 input_offset, const uint8* filter_data, 1673 const Dims<4>& filter_dims, int32 filter_offset, 1674 const int32* bias_data, const Dims<4>& bias_dims, 1675 int stride_width, int stride_height, int pad_width, 1676 int pad_height, int depth_multiplier, 1677 int32 output_offset, int32 output_multiplier, 1678 int output_shift, int32 output_activation_min, 1679 int32 output_activation_max, uint8* output_data, 1680 const Dims<4>& output_dims) { 1681 gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit"); 1682 TFLITE_DCHECK_LE(output_activation_min, output_activation_max); 1683 1684 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1685 const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); 1686 const int input_height = ArraySize(input_dims, 2); 1687 const int input_width = 
ArraySize(input_dims, 1); 1688 const int input_depth = ArraySize(input_dims, 0); 1689 const int filter_height = ArraySize(filter_dims, 2); 1690 const int filter_width = ArraySize(filter_dims, 1); 1691 const int output_height = ArraySize(output_dims, 2); 1692 const int output_width = ArraySize(output_dims, 1); 1693 TFLITE_DCHECK(output_depth == input_depth * depth_multiplier); 1694 1695 static const int kAccBufferMaxSize = 2048; 1696 int32 acc_buffer[kAccBufferMaxSize]; 1697 TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth); 1698 const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; 1699 const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; 1700 TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, 1701 kAccBufferActualSize); 1702 TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize); 1703 TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1); 1704 1705 // row_accum_func will point to the core accumulation function to be used 1706 // for this DepthwiseConv op. 1707 using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric); 1708 row_accum_func_t row_accum_func = nullptr; 1709 1710 #define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ 1711 FIXED_DEPTH_MULTIPLIER) \ 1712 if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \ 1713 (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \ 1714 depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \ 1715 row_accum_func = \ 1716 QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ 1717 FIXED_DEPTH_MULTIPLIER>; \ 1718 } 1719 1720 #ifdef USE_NEON 1721 // We go over our list of kernels by decreasing order of preference 1722 // for the cases where multiple kernels could apply. 1723 1724 // Start with the fastest kernels: AllowStrided=false, fixed input depth. 1725 1726 TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2) 1727 TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2) 1728 TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2) 1729 TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4) 1730 TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1) 1731 TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4) 1732 TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) 1733 TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8) 1734 TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) 1735 TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1) 1736 1737 // Next come the strided kernels: AllowStrided=true, fixed input depth. 1738 // They are a bit less efficient, but allow stride!=1. 1739 1740 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2) 1741 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1) 1742 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16) 1743 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20) 1744 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32) 1745 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) 1746 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1) 1747 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1) 1748 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1) 1749 1750 // Finally, the kernels allowing a variable input depth, 1751 // these are the least efficient but most general kernels. 1752 1753 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) 1754 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) 1755 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3) 1756 #endif // USE_NEON 1757 1758 // No matching fast kernel found, use slow fallback. 1759 if (!row_accum_func) { 1760 row_accum_func = QuantizedDepthwiseConvAccumRowGeneric; 1761 } 1762 1763 #undef TFMINI_USE_DEPTHWISECONV_KERNEL 1764 1765 // Now that we have determined row_accum_func, we can start work. 
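  // The loop nest below walks the output in (batch, row, x-stripe) order: for
  // each batch and output row it carves the row into stripes of at most
  // kOutputPixelsInAccBuffer pixels, seeds acc_buffer with the bias values,
  // lets row_accum_func accumulate the contribution of every valid filter row
  // (filter_y_start to filter_y_end), and finally requantizes the int32
  // accumulators down to uint8.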
1766 uint8* output_ptr = output_data; 1767 for (int b = 0; b < batches; ++b) { 1768 for (int out_y = 0; out_y < output_height; ++out_y) { 1769 const int in_y_origin = (out_y * stride_height) - pad_height; 1770 const int filter_y_start = std::max(0, -in_y_origin); 1771 const int filter_y_end = 1772 std::min(filter_height, input_height - in_y_origin); 1773 for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; 1774 out_x_buffer_start += kOutputPixelsInAccBuffer) { 1775 const int out_x_buffer_end = std::min( 1776 output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); 1777 // We call a 'pixel' a group of activation that share all but the 1778 // 'depth'/'channel' coordinate. num_output_pixels is the number of 1779 // output pixels that we will accumulate in this loop iteration. 1780 const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; 1781 // Initialize our local accumulator with the bias values, so we don't 1782 // have to add them later. 1783 DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, 1784 acc_buffer); 1785 // Accumulation loop. Most of the time should be spent in here. 1786 for (int filter_y = filter_y_start; filter_y < filter_y_end; 1787 ++filter_y) { 1788 const int in_y = in_y_origin + filter_y; 1789 row_accum_func( 1790 stride_width, input_depth, input_width, 1791 input_data + in_y * input_dims.strides[2] + 1792 b * input_dims.strides[3], 1793 input_offset, pad_width, depth_multiplier, filter_width, 1794 filter_data + filter_y * filter_dims.strides[2], filter_offset, 1795 out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); 1796 } 1797 // Finished accumulating int32 values. Now need to convert them to 1798 // the final 8bit form and store them. 1799 gemmlowp::ScopedProfilingLabel label("downquantize+store"); 1800 const int num_output_values = output_depth * num_output_pixels; 1801 int i = 0; 1802 #ifdef USE_NEON 1803 using gemmlowp::RoundingDivideByPOT; 1804 const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); 1805 const int32x4_t output_activation_min_vec = 1806 vdupq_n_s32(output_activation_min); 1807 const int32x4_t output_activation_max_vec = 1808 vdupq_n_s32(output_activation_max); 1809 // Handle 16 values at once. 1810 // This allows us to issue 4 mutually independent int32 1811 // multiplications (vqrdmulh), which should alleviate most of their 1812 // high latency. 1813 for (; i <= num_output_values - 16; i += 16) { 1814 int32x4_t acc[4]; 1815 for (int j = 0; j < 4; j++) { 1816 acc[j] = vld1q_s32(acc_buffer + i + 4 * j); 1817 } 1818 1819 // Fixed-point multiplication. 1820 for (int j = 0; j < 4; j++) { 1821 acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier); 1822 } 1823 for (int j = 0; j < 4; j++) { 1824 acc[j] = RoundingDivideByPOT(acc[j], output_shift); 1825 } 1826 // Add the output offset. 1827 for (int j = 0; j < 4; j++) { 1828 acc[j] = vaddq_s32(acc[j], output_offset_vec); 1829 } 1830 // Apply the activation function. 1831 for (int j = 0; j < 4; j++) { 1832 acc[j] = vmaxq_s32(acc[j], output_activation_min_vec); 1833 } 1834 for (int j = 0; j < 4; j++) { 1835 acc[j] = vminq_s32(acc[j], output_activation_max_vec); 1836 } 1837 // Saturating cast to uint8 and store to destination. 
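          // The narrowing happens in two saturating steps: vqmovn_s32 packs
          // each int32x4 into an int16x4, the pairs are combined into int16x8
          // vectors, and vqmovun_s16 packs those to uint8 with unsigned
          // saturation. Values already clamped to the uint8 activation range
          // above pass through both steps unchanged.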
1838 int16x4_t acc_s16[4]; 1839 for (int j = 0; j < 4; j++) { 1840 acc_s16[j] = vqmovn_s32(acc[j]); 1841 } 1842 const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]); 1843 const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]); 1844 const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0); 1845 const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1); 1846 vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1)); 1847 output_ptr += 16; 1848 } 1849 // Handle 8 values at once. 1850 // Not as good as 16 (now we're only issuing 2 mutually independent 1851 // vqrdmulh instructions, so we're probably paying for their high 1852 // latency). 1853 for (; i <= num_output_values - 8; i += 8) { 1854 int32x4_t acc0 = vld1q_s32(acc_buffer + i); 1855 int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4); 1856 // Fixed-point multiplication. 1857 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier); 1858 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier); 1859 // Rounding right shift. 1860 acc0 = RoundingDivideByPOT(acc0, output_shift); 1861 acc1 = RoundingDivideByPOT(acc1, output_shift); 1862 // Add the output offset. 1863 acc0 = vaddq_s32(acc0, output_offset_vec); 1864 acc1 = vaddq_s32(acc1, output_offset_vec); 1865 // Apply the activation function. 1866 acc0 = vmaxq_s32(acc0, output_activation_min_vec); 1867 acc1 = vmaxq_s32(acc1, output_activation_min_vec); 1868 acc0 = vminq_s32(acc0, output_activation_max_vec); 1869 acc1 = vminq_s32(acc1, output_activation_max_vec); 1870 // Saturating cast to uint8 and store to destination. 1871 const int16x4_t acc0_s16 = vqmovn_s32(acc0); 1872 const int16x4_t acc1_s16 = vqmovn_s32(acc1); 1873 const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16); 1874 const uint8x8_t res_u8 = vqmovun_s16(res_s16); 1875 vst1_u8(output_ptr, res_u8); 1876 output_ptr += 8; 1877 } 1878 // Handle 4 values at once. Now we're paying the full price of the 1879 // high latency of vqrdmulh. Also, storing only 4 bytes at the end 1880 // (without any alignment) can only be done 1 byte at a time. 1881 // Yet, that is still worth doing to minimize the amount of leftover 1882 // that will have to go through the very slow scalar code. 1883 for (; i <= num_output_values - 4; i += 4) { 1884 int32x4_t acc = vld1q_s32(acc_buffer + i); 1885 // Fixed-point multiplication. 1886 acc = vqrdmulhq_n_s32(acc, output_multiplier); 1887 // Rounding right shift. 1888 acc = RoundingDivideByPOT(acc, output_shift); 1889 // Add the output offset. 1890 acc = vaddq_s32(acc, output_offset_vec); 1891 // Apply the activation function. 1892 acc = vmaxq_s32(acc, output_activation_min_vec); 1893 acc = vminq_s32(acc, output_activation_max_vec); 1894 // Saturating cast to uint8 and store to destination. 1895 const int16x4_t acc_s16 = vqmovn_s32(acc); 1896 const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16); 1897 const uint8x8_t res_u8 = vqmovun_s16(res_s16); 1898 vst1_lane_u8(output_ptr + 0, res_u8, 0); 1899 vst1_lane_u8(output_ptr + 1, res_u8, 1); 1900 vst1_lane_u8(output_ptr + 2, res_u8, 2); 1901 vst1_lane_u8(output_ptr + 3, res_u8, 3); 1902 output_ptr += 4; 1903 } 1904 #endif // USE_NEON 1905 1906 // Handle leftover values, one by one. This is very slow. 
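        // Each leftover accumulator goes through the same scalar
        // requantization as the vector code above, approximately:
        //   scaled = rounding_shift_right((acc * output_multiplier) / 2^31,
        //                                 output_shift)
        //   out    = clamp(scaled + output_offset,
        //                  output_activation_min, output_activation_max)
        // which is what MultiplyByQuantizedMultiplierSmallerThanOne plus the
        // offset/clamp below implement.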
1907 for (; i < num_output_values; i++) { 1908 int32 acc = acc_buffer[i]; 1909 acc = MultiplyByQuantizedMultiplierSmallerThanOne( 1910 acc, output_multiplier, output_shift); 1911 acc += output_offset; 1912 acc = std::max(acc, output_activation_min); 1913 acc = std::min(acc, output_activation_max); 1914 *output_ptr++ = static_cast<uint8>(acc); 1915 } 1916 } 1917 } 1918 } 1919 } 1920 1921 // Legacy, for compatibility with old checked-in code. 1922 template <FusedActivationFunctionType Ac> 1923 void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, 1924 int32 input_offset, const uint8* filter_data, 1925 const Dims<4>& filter_dims, int32 filter_offset, 1926 const int32* bias_data, const Dims<4>& bias_dims, 1927 int stride_width, int stride_height, int pad_width, 1928 int pad_height, int depth_multiplier, int32 output_offset, 1929 int32 output_multiplier, int output_shift, 1930 int32 output_activation_min, int32 output_activation_max, 1931 uint8* output_data, const Dims<4>& output_dims) { 1932 if (Ac == FusedActivationFunctionType::kNone) { 1933 TFLITE_DCHECK_EQ(output_activation_min, 0); 1934 TFLITE_DCHECK_EQ(output_activation_max, 255); 1935 } 1936 DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims, 1937 filter_offset, bias_data, bias_dims, stride_width, 1938 stride_height, pad_width, pad_height, depth_multiplier, 1939 output_offset, output_multiplier, output_shift, 1940 output_activation_min, output_activation_max, output_data, 1941 output_dims); 1942 } 1943 1944 // Legacy, for compatibility with old checked-in code. 1945 template <FusedActivationFunctionType Ac> 1946 void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, 1947 int32 input_offset, const uint8* filter_data, 1948 const Dims<4>& filter_dims, int32 filter_offset, 1949 const int32* bias_data, const Dims<4>& bias_dims, int stride, 1950 int pad_width, int pad_height, int depth_multiplier, 1951 int32 output_offset, int32 output_multiplier, 1952 int output_shift, int32 output_activation_min, 1953 int32 output_activation_max, uint8* output_data, 1954 const Dims<4>& output_dims) { 1955 DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data, 1956 filter_dims, filter_offset, bias_data, bias_dims, stride, 1957 stride, pad_width, pad_height, depth_multiplier, 1958 output_offset, output_multiplier, output_shift, 1959 output_activation_min, output_activation_max, output_data, 1960 output_dims); 1961 } 1962 1963 } // namespace optimized_ops 1964 } // namespace tflite 1965 1966 #endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_ 1967