/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
#define ANDROID_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_

#include "fixedpoint.h"
#include "gemmlowp.h"
#include "../common.h"
#include "../types.h"

namespace android {
namespace nn {
namespace optimized_ops {

// Implementation of quantized DepthwiseConv

template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct QuantizedDepthwiseConvKernel {};

#ifdef USE_NEON
template <>
struct QuantizedDepthwiseConvKernel<true, 8, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8x2_t filter_u8;
    filter_u8.val[0] = vld1_u8(filter_ptr);
    filter_u8.val[1] = vld1_u8(filter_ptr + 8);
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
                            vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4x2_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
                                  vget_low_s16(input_dup2.val[i]));
        acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
                                  vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
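    // Note on the widening pattern used throughout these kernels: uint8
    // values are widened to int16 (vmovl_u8 plus a reinterpret) and the
    // offset is added in int16; with gemmlowp-style quantization the offsets
    // are negated zero points, so the sums fit in int16. The int16 products
    // are then accumulated into int32 lanes via vmlal_s16.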
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
      acc[1] =
          vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
      acc[3] =
          vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      acc[0] = vld1q_s32(acc_buffer_ptr);
      acc[1] = vld1q_s32(acc_buffer_ptr + 4);

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc[0]);
      vst1q_s32(acc_buffer_ptr + 4, acc[1]);
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
                                   vget_low_s16(input_dup2.val[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
                                   vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4x2_t input_dup2 = vzip_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }
    int outp = 0;
    // Handle two output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate.
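      // Lane-broadcast multiply-accumulate: vmlal_lane_s16 multiplies a
      // 4-lane filter half by a single input lane. Input lanes 0/1 are the
      // two channels of pixel 0 and lanes 2/3 those of pixel 1, while
      // filter[0]/filter[1] hold the 8 per-channel multipliers, so acc[0..3]
      // accumulate pixel 0's 16 outputs and acc[4..7] pixel 1's.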
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);

      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
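    // With one input channel and depth multiplier 2, zipping the input
    // vector with itself duplicates each value into two adjacent lanes, so
    // a single 8-byte load feeds all 16 outputs of 8 output pixels.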
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x2_t acc = vld1_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32 input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32 input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vmlal_n_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      }
      input_ptr += 16;
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
        acc[2 * i + 1] =
            vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc;
      acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
                              vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
                              vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
                              vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
                              vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
                              vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
                              vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
                              vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
                              vget_high_s16(input), 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 3> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // We will have to duplicate bytes in a NEON register, 3-fold.
    // We will do that by register-level table-look-up using VTBL instructions.
    // Here we prepare the registers containing the table-lookup indices.
    static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
                                                   {2, 3, 3, 3, 4, 4, 4, 5},
                                                   {5, 5, 6, 6, 6, 7, 7, 7}};
    uint8x8_t dup3_indices[3];
    for (int i = 0; i < 3; i++) {
      dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
    }

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8* local_filter_ptr = filter_ptr;
      const uint8* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
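        // Each group of 8 input channels consumes 24 filter bytes, laid out
        // as 8 channels x 3 consecutive per-channel outputs; the VTBL-based
        // 3-fold duplication below expands the 8 input bytes into the
        // matching 24-byte sequence 0,0,0,1,1,1,...,7,7,7.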
        int16x8_t filter[3];
        uint8x8x3_t filter_u8;
        filter_u8.val[0] = vld1_u8(local_filter_ptr);
        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
        filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
        local_filter_ptr += 24;
        for (int i = 0; i < 3; i++) {
          const int16x8_t filter_s16 =
              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        }
        // Load the inputs, duplicate 3-fold, add input_offset.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;

        uint8x8_t input_u8_dup3[3];
        for (int i = 0; i < 3; i++) {
          input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
        }
        int16x8_t input_dup3[3];
        for (int i = 0; i < 3; i++) {
          const int16x8_t input_s16_dup3 =
              vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
          input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
        }
        // Load the accumulators from acc_buffer
        int32x4x3_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
          acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
        }
        // Multiply-accumulate
        for (int j = 0; j < 3; j++) {
          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
                                    vget_low_s16(filter[j]));
          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
                                    vget_high_s16(filter[j]));
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
        }
        acc_buffer_ptr += 24;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const int16 input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 3; i++) {
          const int16 filter_val = local_filter_ptr[i] + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
        }
        local_filter_ptr += 3;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8* local_filter_ptr = filter_ptr;
      const uint8* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        int16x8_t filter[2];
        uint8x8x2_t filter_u8;
        filter_u8.val[0] = vld1_u8(local_filter_ptr);
        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
        local_filter_ptr += 16;
        for (int i = 0; i < 2; i++) {
          const int16x8_t filter_s16 =
              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        }
        // Load the inputs, add input_offset, duplicate 2-fold.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        const int16x8x2_t input_dup2 = vzipq_s16(input, input);
        // Load the accumulators from acc_buffer.
        int32x4x2_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
        }
        // Multiply-accumulate.
        for (int j = 0; j < 2; j++) {
          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
                                    vget_low_s16(input_dup2.val[j]));
          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
                                    vget_high_s16(input_dup2.val[j]));
        }
        // Store the accumulators back to acc_buffer.
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        // Load the inputs.
        const int16 input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 2; i++) {
          const int16 filter_val = local_filter_ptr[i] + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
        }
        local_filter_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8* local_filter_ptr = filter_ptr;
      const uint8* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16) {
        // Load the filters, add filter_offset.
        uint8x8_t filter_u8[2];
        for (int i = 0; i < 2; i++) {
          filter_u8[i] = vld1_u8(local_filter_ptr + 8 * i);
        }
        local_filter_ptr += 16;
        int16x8_t filter[2];
        for (int i = 0; i < 2; i++) {
          filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
        }
        for (int i = 0; i < 2; i++) {
          filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
        }
        // Load the inputs, add input_offset.
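        // 16 channels at depth multiplier 1 yield 16 outputs per pixel:
        // two 8-wide input/filter vector pairs feed four int32x4
        // accumulators below.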
        uint8x8_t input_u8[2];
        for (int i = 0; i < 2; i++) {
          input_u8[i] = vld1_u8(local_input_ptr + 8 * i);
        }
        local_input_ptr += 16;
        int16x8_t input[2];
        for (int i = 0; i < 2; i++) {
          input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
        }
        for (int i = 0; i < 2; i++) {
          input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
        }
        // Load the accumulators from acc_buffer
        int32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        for (int i = 0; i < 2; i++) {
          acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
                                     vget_low_s16(filter[i]));
          acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
                                     vget_high_s16(filter[i]));
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
        local_filter_ptr += 8;
        const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
        const int16x8_t filter =
            vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        // Load the inputs, add input_offset.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
        acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const int16 input_val = *local_input_ptr++ + input_offset;
        const int16 filter_val = *local_filter_ptr++ + filter_offset;
        *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 16, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8[2];
    for (int i = 0; i < 2; i++) {
      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    }
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += input_ptr_increment;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
                                   vget_low_s16(filter[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
                                   vget_high_s16(filter[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 16> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8[2];
    for (int i = 0; i < 2; i++) {
      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    }
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8 input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16 input = static_cast<int16>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
        acc[2 * i + 1] =
            vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter = vaddq_s16(
        vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8 input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16 input = static_cast<int16>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};
#endif

// Accumulates the effect of one row of the filter, on a segment of one row
// of the output, accessing the corresponding one row of the input.
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void QuantizedDepthwiseConvAccumRow(
    int stride, int input_depth, int input_width, const uint8* input_data,
    int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
    const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
    int out_x_buffer_end, int output_depth, int32* acc_buffer) {
#ifdef GEMMLOWP_PROFILING
  gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
#endif
  // Sanity check parameters. This is important in particular to ensure
  // that we keep the number of template instantiations minimal, so we don't
  // increase binary size unnecessarily.
  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
  static_assert(kFixedInputDepth || kAllowStrided, "");
  DCHECK(stride == 1 || kAllowStrided);
  if (kFixedInputDepth) {
    DCHECK_EQ(input_depth, kFixedInputDepth);
  }
  if (kFixedDepthMultiplier) {
    DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
  }
  DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  const int input_ptr_increment = stride * input_depth;
  const uint8* filter_base_ptr = filter_data;
  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
    // For the current (filter_x, filter_y) point in the filter,
    // compute the boundaries of the corresponding output row segment.
    int out_x_loop_start_unclamped = 0;
    int out_x_loop_end_unclamped = 0;
    if (kAllowStrided) {
      if (stride == 2) {
        out_x_loop_start_unclamped = (pad_width - filter_x + 1) / 2;
        out_x_loop_end_unclamped =
            (pad_width + input_width - filter_x + 1) / 2;
      } else if (stride == 4) {
        out_x_loop_start_unclamped = (pad_width - filter_x + 3) / 4;
        out_x_loop_end_unclamped =
            (pad_width + input_width - filter_x + 3) / 4;
      } else {
        out_x_loop_start_unclamped =
            (pad_width - filter_x + stride - 1) / stride;
        out_x_loop_end_unclamped =
            (pad_width + input_width - filter_x + stride - 1) / stride;
      }
    } else {
      out_x_loop_start_unclamped = pad_width - filter_x;
      out_x_loop_end_unclamped = pad_width + input_width - filter_x;
    }
    // The kernel will have to iterate on the segment of the
    // output row that runs from out_x_loop_start to out_x_loop_end.
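    // (Bounds derivation: the input x coordinate is
    // in_x = out_x * stride - pad_width + filter_x and must satisfy
    // 0 <= in_x < input_width, i.e. out_x ranges over
    // [ceil((pad_width - filter_x) / stride),
    //  ceil((pad_width + input_width - filter_x) / stride)),
    // which is what the (a + stride - 1) / stride expressions above compute;
    // the max/min below then clamp that range to the current buffer segment.)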
    const int out_x_loop_start =
        std::max(out_x_buffer_start, out_x_loop_start_unclamped);
    const int out_x_loop_end =
        std::min(out_x_buffer_end, out_x_loop_end_unclamped);

    int32* acc_buffer_ptr =
        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
    const uint8* input_ptr = input_data + in_x_origin * input_depth;
    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
    QuantizedDepthwiseConvKernel<
        kAllowStrided, kFixedInputDepth,
        kFixedDepthMultiplier>::Run(num_output_pixels, input_depth,
                                    depth_multiplier, input_ptr, input_offset,
                                    input_ptr_increment, filter_base_ptr,
                                    filter_offset, acc_buffer_ptr);
    filter_base_ptr += output_depth;
  }
}

// Generic fallback of DepthwiseConvAccumRow, portable, non-templatized.
inline void QuantizedDepthwiseConvAccumRowGeneric(
    int stride, int input_depth, int input_width, const uint8* input_data,
    int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
    const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
    int out_x_buffer_end, int output_depth, int32* acc_buffer) {
  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
  const uint8* filter_base_ptr = filter_data;
  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
    const int out_x_loop_start = std::max(
        out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
    const int out_x_loop_end =
        std::min(out_x_buffer_end,
                 (pad_width + input_width - filter_x + stride - 1) / stride);

    int32* acc_buffer_ptr =
        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
    const uint8* input_ptr = input_data + in_x_origin * input_depth;
    const int input_ptr_increment = (stride - 1) * input_depth;
    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
      const uint8* filter_ptr = filter_base_ptr;
      for (int ic = 0; ic < input_depth; ++ic) {
        const int16 input_val = *input_ptr++ + input_offset;
        for (int m = 0; m < depth_multiplier; m++) {
          const int16 filter_val = *filter_ptr++ + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
        }
      }
      input_ptr += input_ptr_increment;
    }
    filter_base_ptr += output_depth;
  }
}

// Initializes the accumulator buffer with bias values.
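// acc_buffer is laid out as [num_output_pixels][output_depth] int32 values,
// so initialization amounts to replicating the output_depth-sized bias
// vector once per output pixel. The NEON paths below specialize the common
// small depths; e.g. for output_depth == 2, one q-register holds two copies
// of {bias[0], bias[1]}, so each store initializes two whole pixels.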
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const int32* bias_data,
                                       int32* acc_buffer) {
  int i = 0;
#ifdef USE_NEON
  if (output_depth == 1) {
    const int32x4_t b = vdupq_n_s32(bias_data[0]);
    for (; i <= num_output_pixels - 16; i += 16) {
      vst1q_s32(acc_buffer + i + 0, b);
      vst1q_s32(acc_buffer + i + 4, b);
      vst1q_s32(acc_buffer + i + 8, b);
      vst1q_s32(acc_buffer + i + 12, b);
    }
    for (; i <= num_output_pixels - 4; i += 4) {
      vst1q_s32(acc_buffer + i, b);
    }
  } else if (output_depth == 2) {
    int32x4_t b = vdupq_n_s32(bias_data[0]);
    b = vsetq_lane_s32(bias_data[1], b, 1);
    b = vsetq_lane_s32(bias_data[1], b, 3);
    for (; i <= num_output_pixels - 8; i += 8) {
      vst1q_s32(acc_buffer + 2 * i + 0, b);
      vst1q_s32(acc_buffer + 2 * i + 4, b);
      vst1q_s32(acc_buffer + 2 * i + 8, b);
      vst1q_s32(acc_buffer + 2 * i + 12, b);
    }
    for (; i <= num_output_pixels - 2; i += 2) {
      vst1q_s32(acc_buffer + 2 * i, b);
    }
  } else if (output_depth == 4) {
    const int32x4_t b = vld1q_s32(bias_data);
    for (; i <= num_output_pixels - 4; i += 4) {
      vst1q_s32(acc_buffer + 4 * i + 0, b);
      vst1q_s32(acc_buffer + 4 * i + 4, b);
      vst1q_s32(acc_buffer + 4 * i + 8, b);
      vst1q_s32(acc_buffer + 4 * i + 12, b);
    }
    for (; i < num_output_pixels; i++) {
      vst1q_s32(acc_buffer + 4 * i, b);
    }
  } else if (output_depth == 8) {
    const int32x4_t b0 = vld1q_s32(bias_data);
    const int32x4_t b1 = vld1q_s32(bias_data + 4);
    for (; i <= num_output_pixels - 2; i += 2) {
      vst1q_s32(acc_buffer + 8 * i + 0, b0);
      vst1q_s32(acc_buffer + 8 * i + 4, b1);
      vst1q_s32(acc_buffer + 8 * i + 8, b0);
      vst1q_s32(acc_buffer + 8 * i + 12, b1);
    }
    for (; i < num_output_pixels; i++) {
      vst1q_s32(acc_buffer + 8 * i + 0, b0);
      vst1q_s32(acc_buffer + 8 * i + 4, b1);
    }
  } else if (output_depth == 16) {
    const int32x4_t b0 = vld1q_s32(bias_data);
    const int32x4_t b1 = vld1q_s32(bias_data + 4);
    const int32x4_t b2 = vld1q_s32(bias_data + 8);
    const int32x4_t b3 = vld1q_s32(bias_data + 12);
    for (; i < num_output_pixels; i++) {
      vst1q_s32(acc_buffer + 16 * i + 0, b0);
      vst1q_s32(acc_buffer + 16 * i + 4, b1);
      vst1q_s32(acc_buffer + 16 * i + 8, b2);
      vst1q_s32(acc_buffer + 16 * i + 12, b3);
    }
  }
#endif
  for (; i < num_output_pixels; i++) {
    memcpy(acc_buffer + i * output_depth, bias_data,
           sizeof(acc_buffer[0]) * output_depth);
  }
}

template <FusedActivationFunctionType Ac>
void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
                   int32 input_offset, const uint8* filter_data,
                   const Dims<4>& filter_dims, int32 filter_offset,
                   const int32* bias_data, const Dims<4>& bias_dims,
                   int stride_width, int stride_height,
                   int pad_width, int pad_height, int depth_multiplier,
                   int32 output_offset, int32 output_multiplier,
                   int output_shift, int32 output_activation_min,
                   int32 output_activation_max, uint8* output_data,
                   const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  DCHECK_LE(output_activation_min, output_activation_max);
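  // Without a fused activation, the quantized activation range is expected
  // to span the full uint8 output range, which the checks below enforce.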
  if (Ac == FusedActivationFunctionType::kNone) {
    DCHECK_EQ(output_activation_min, 0);
    DCHECK_EQ(output_activation_max, 255);
  }
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int input_depth = ArraySize(input_dims, 0);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  DCHECK(output_depth == input_depth * depth_multiplier);

  static const int kAccBufferMaxSize = 1024;
  int32 acc_buffer[kAccBufferMaxSize];
  DCHECK_GE(kAccBufferMaxSize, output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize);
  DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
  DCHECK_GE(kOutputPixelsInAccBuffer, 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConv op.
  auto* row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;

  const int kMaxFixedDepthMultiplier = 16;
  int fixed_depth_multiplier = 0;
  if (depth_multiplier <= kMaxFixedDepthMultiplier) {
    fixed_depth_multiplier = depth_multiplier;
  }
  // kMaxUnrolling is the max number of output values that we aim to handle
  // in one unrolled iteration of the inner loop. For practical performance
  // reasons, it is limited by the number of available registers. We could
  // fine-tune it depending on the architecture, but that's not worth doing
  // since this whole code is not very optimized to begin with. The
  // present value reflects what's realistic on ARM 32bit NEON with 16 128-bit
  // vector registers.
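  // (As a data point, the <true, 16, 1> kernel above keeps 4 accumulator,
  // 2 filter and 2 input q-registers live per iteration, already half of
  // the 16-register file.)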
  const int kMaxUnrolling = 16;
  int fixed_input_depth = 0;
  if (fixed_depth_multiplier &&
      input_depth * fixed_depth_multiplier <= kMaxUnrolling) {
    fixed_input_depth = input_depth;
  }
#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH,   \
                                        FIXED_DEPTH_MULTIPLIER)             \
  if ((stride_width == 1 || ALLOW_STRIDED) &&                               \
      fixed_input_depth == FIXED_INPUT_DEPTH &&                             \
      fixed_depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                   \
    row_accum_func =                                                        \
        QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,    \
                                       FIXED_DEPTH_MULTIPLIER>;             \
  }

#ifdef USE_NEON
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
#endif  // USE_NEON

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  // Now that we have determined row_accum_func, we can start work.
  uint8* output_ptr = output_data;
  for (int b = 0; b < batches; ++b) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      const int filter_y_start = std::max(0, -in_y_origin);
      const int filter_y_end =
          std::min(filter_height, input_height - in_y_origin);
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer) {
        const int out_x_buffer_end = std::min(
            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activations that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        // Initialize our local accumulator with the bias values, so we don't
        // have to add them later.
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
                                   acc_buffer);
        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end;
             ++filter_y) {
          const int in_y = in_y_origin + filter_y;
          row_accum_func(
              stride_width, input_depth, input_width,
              input_data + in_y * input_dims.strides[2] +
                  b * input_dims.strides[3],
              input_offset, pad_width, depth_multiplier, filter_width,
              filter_data + filter_y * filter_dims.strides[2], filter_offset,
              out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
        }
        // Finished accumulating int32 values. Now need to convert them to
        // the final 8bit form and store them.
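        // Requantization: each int32 accumulator becomes
        //   RoundingDivideByPOT(vqrdmulh(acc, output_multiplier),
        //                       output_shift) + output_offset,
        // clamped to the activation range. For kNone the explicit clamp is
        // skipped below: min/max are 0/255, which the saturating uint8
        // narrowing (vqmovun_s16) already enforces.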
        gemmlowp::ScopedProfilingLabel label("downquantize+store");
        const int num_output_values = output_depth * num_output_pixels;
        int i = 0;
#ifdef USE_NEON
        using gemmlowp::RoundingDivideByPOT;
        const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
        const int32x4_t output_activation_min_vec =
            vdupq_n_s32(output_activation_min);
        const int32x4_t output_activation_max_vec =
            vdupq_n_s32(output_activation_max);
        // Handle 16 values at once.
        // This allows us to issue 4 mutually independent int32
        // multiplications (vqrdmulh), which should alleviate most of their
        // high latency.
        for (; i <= num_output_values - 16; i += 16) {
          int32x4_t acc[4];
          for (int j = 0; j < 4; j++) {
            acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
          }

          // Fixed-point multiplication.
          for (int j = 0; j < 4; j++) {
            acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
          }
          // Rounding right shift.
          for (int j = 0; j < 4; j++) {
            acc[j] = RoundingDivideByPOT(acc[j], output_shift);
          }
          // Add the output offset.
          for (int j = 0; j < 4; j++) {
            acc[j] = vaddq_s32(acc[j], output_offset_vec);
          }
          // Apply the activation function.
          if (Ac != FusedActivationFunctionType::kNone) {
            for (int j = 0; j < 4; j++) {
              acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
            }
            for (int j = 0; j < 4; j++) {
              acc[j] = vminq_s32(acc[j], output_activation_max_vec);
            }
          }
          // Saturating cast to uint8 and store to destination.
          int16x4_t acc_s16[4];
          for (int j = 0; j < 4; j++) {
            acc_s16[j] = vqmovn_s32(acc[j]);
          }
          const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
          const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
          const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
          const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
          vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
          output_ptr += 16;
        }
        // Handle 8 values at once.
        // Not as good as 16 (now we're only issuing 2 mutually independent
        // vqrdmulh instructions, so we're probably paying for their high
        // latency).
        for (; i <= num_output_values - 8; i += 8) {
          int32x4_t acc0 = vld1q_s32(acc_buffer + i);
          int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
          // Fixed-point multiplication.
          acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
          acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
          // Rounding right shift.
          acc0 = RoundingDivideByPOT(acc0, output_shift);
          acc1 = RoundingDivideByPOT(acc1, output_shift);
          // Add the output offset.
          acc0 = vaddq_s32(acc0, output_offset_vec);
          acc1 = vaddq_s32(acc1, output_offset_vec);
          // Apply the activation function.
          if (Ac != FusedActivationFunctionType::kNone) {
            acc0 = vmaxq_s32(acc0, output_activation_min_vec);
            acc1 = vmaxq_s32(acc1, output_activation_min_vec);
            acc0 = vminq_s32(acc0, output_activation_max_vec);
            acc1 = vminq_s32(acc1, output_activation_max_vec);
          }
          // Saturating cast to uint8 and store to destination.
          const int16x4_t acc0_s16 = vqmovn_s32(acc0);
          const int16x4_t acc1_s16 = vqmovn_s32(acc1);
          const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
          vst1_u8(output_ptr, res_u8);
          output_ptr += 8;
        }
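        // Note on the saturating narrowing chain above (illustrative
        // values): vqmovn_s32 saturates each int32 lane to int16, then
        // vqmovun_s16 saturates each signed int16 lane to uint8, so a
        // post-offset value of e.g. 300 stores as 255 and -5 stores as 0,
        // even on the kNone activation path, where the explicit vmax/vmin
        // clamping is skipped but the valid range is known to be [0, 255].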
        // Handle 4 values at once. Now we're paying the full price of the
        // high latency of vqrdmulh. Also, storing only 4 bytes at the end
        // (without any alignment) can only be done 1 byte at a time.
        // Yet, that is still worth doing to minimize the amount of leftover
        // that will have to go through the very slow scalar code.
        for (; i <= num_output_values - 4; i += 4) {
          int32x4_t acc = vld1q_s32(acc_buffer + i);
          // Fixed-point multiplication.
          acc = vqrdmulhq_n_s32(acc, output_multiplier);
          // Rounding right shift.
          acc = RoundingDivideByPOT(acc, output_shift);
          // Add the output offset.
          acc = vaddq_s32(acc, output_offset_vec);
          // Apply the activation function.
          if (Ac != FusedActivationFunctionType::kNone) {
            acc = vmaxq_s32(acc, output_activation_min_vec);
            acc = vminq_s32(acc, output_activation_max_vec);
          }
          // Saturating cast to uint8 and store to destination.
          const int16x4_t acc_s16 = vqmovn_s32(acc);
          const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
          vst1_lane_u8(output_ptr + 0, res_u8, 0);
          vst1_lane_u8(output_ptr + 1, res_u8, 1);
          vst1_lane_u8(output_ptr + 2, res_u8, 2);
          vst1_lane_u8(output_ptr + 3, res_u8, 3);
          output_ptr += 4;
        }
#endif  // USE_NEON

        // Handle leftover values, one by one. This is very slow.
        for (; i < num_output_values; i++) {
          int32 acc = acc_buffer[i];
          acc = MultiplyByQuantizedMultiplierSmallerThanOne(
              acc, output_multiplier, output_shift);
          acc += output_offset;
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          *output_ptr++ = static_cast<uint8>(acc);
        }
      }
    }
  }
}

}  // namespace optimized_ops
}  // namespace nn
}  // namespace android

#endif  // ANDROID_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_