/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_

#include "fixedpoint/fixedpoint.h"
#include "public/gemmlowp.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_ops {

// Implementation of quantized DepthwiseConv
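//
// As the specializations below illustrate, each QuantizedDepthwiseConvKernel
// provides the inner loop of the quantized depthwise convolution for one
// compile-time choice of input depth and depth multiplier. Run() processes
// num_output_pixels output pixels: it adds input_offset to the uint8 input
// values and filter_offset to the uint8 filter values, multiplies them, and
// accumulates the products into the int32 acc_buffer, which holds
// input_depth * depth_multiplier accumulators per output pixel. kAllowStrided
// means consecutive input pixels are reached through input_ptr_increment
// rather than being assumed contiguous, and a fixed input depth of 0 means
// the depth is only known at runtime.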

template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct QuantizedDepthwiseConvKernel {};

#ifdef USE_NEON
template <>
struct QuantizedDepthwiseConvKernel<true, 8, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8x2_t filter_u8;
    filter_u8.val[0] = vld1_u8(filter_ptr);
    filter_u8.val[1] = vld1_u8(filter_ptr + 8);
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
                            vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4x2_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
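      // (vzipq_s16(input, input) interleaves the vector with itself, so each
      // input value appears twice in a row, matching the two filter taps that
      // the depth multiplier of 2 assigns to each input channel.)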
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
                                  vget_low_s16(input_dup2.val[i]));
        acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
                                  vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
      acc[1] =
          vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
      acc[3] =
          vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      acc[0] = vld1q_s32(acc_buffer_ptr);
      acc[1] = vld1q_s32(acc_buffer_ptr + 4);

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc[0]);
      vst1q_s32(acc_buffer_ptr + 4, acc[1]);
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
                                   vget_low_s16(input_dup2.val[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
                                   vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4x2_t input_dup2 = vzip_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }
    int outp = 0;
    // Handle two output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);

      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x2_t acc = vld1_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32 input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32 input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vmlal_n_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      }
      input_ptr += 16;
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
        acc[2 * i + 1] =
            vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc;
      acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
                              vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
                              vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
                              vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
                              vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
                              vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
                              vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
                              vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
                              vget_high_s16(input), 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 3> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // We will have to duplicate bytes in a NEON register, 3-fold.
    // We will do that by register-level table-look-up using VTBL instructions.
    // Here we prepare the registers containing the table-lookup indices.
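    // For example, applied to input bytes [a, b, c, d, e, f, g, h], the three
    // index vectors below produce [a, a, a, b, b, b, c, c],
    // [c, d, d, d, e, e, e, f] and [f, f, g, g, g, h, h, h].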
    static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
                                                   {2, 3, 3, 3, 4, 4, 4, 5},
                                                   {5, 5, 6, 6, 6, 7, 7, 7}};
    uint8x8_t dup3_indices[3];
    for (int i = 0; i < 3; i++) {
      dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
    }

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8* local_filter_ptr = filter_ptr;
      const uint8* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        int16x8_t filter[3];
        uint8x8x3_t filter_u8;
        filter_u8.val[0] = vld1_u8(local_filter_ptr);
        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
        filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
        local_filter_ptr += 24;
        for (int i = 0; i < 3; i++) {
          const int16x8_t filter_s16 =
              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        }
        // Load the inputs, duplicate 3-fold, add input_offset.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;

        uint8x8_t input_u8_dup3[3];
        for (int i = 0; i < 3; i++) {
          input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
        }
        int16x8_t input_dup3[3];
        for (int i = 0; i < 3; i++) {
          const int16x8_t input_s16_dup3 =
              vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
          input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
        }
        // Load the accumulators from acc_buffer
        int32x4x3_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
          acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
        }
        // Multiply-accumulate
        for (int j = 0; j < 3; j++) {
          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
                                    vget_low_s16(filter[j]));
          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
                                    vget_high_s16(filter[j]));
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
        }
        acc_buffer_ptr += 24;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const int16 input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 3; i++) {
          const int16 filter_val = local_filter_ptr[i] + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
        }
        local_filter_ptr += 3;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8* local_filter_ptr = filter_ptr;
      const uint8* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        int16x8_t filter[2];
        uint8x8x2_t filter_u8;
        filter_u8.val[0] = vld1_u8(local_filter_ptr);
        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
        local_filter_ptr += 16;
        for (int i = 0; i < 2; i++) {
          const int16x8_t filter_s16 =
              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        }
        // Load the inputs, add input_offset, duplicate 2-fold.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        const int16x8x2_t input_dup2 = vzipq_s16(input, input);
        // Load the accumulators from acc_buffer.
        int32x4x2_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
        }
        // Multiply-accumulate.
        for (int j = 0; j < 2; j++) {
          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
                                    vget_low_s16(input_dup2.val[j]));
          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
                                    vget_high_s16(input_dup2.val[j]));
        }
        // Store the accumulators back to acc_buffer.
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        // Load the inputs.
        const int16 input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 2; i++) {
          const int16 filter_val = local_filter_ptr[i] + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
        }
        local_filter_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const uint8* local_filter_ptr = filter_ptr;
      const uint8* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16) {
        // Load the filters, add filter_offset.
        uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
        uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
        local_filter_ptr += 16;
        int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
        int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
        filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
        filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
        // Load the inputs, add input_offset.
        uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
        uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
        local_input_ptr += 16;
        int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
        int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
        input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
        input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
        int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
        int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
        int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
        acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
        acc_1 =
            vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
        acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
        acc_3 =
            vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
        // Store the accumulators back to acc_buffer
        vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
        vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
        vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
        vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
        acc_buffer_ptr += 16;
      }
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters, add filter_offset.
        const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
        local_filter_ptr += 8;
        const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
        const int16x8_t filter =
            vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        // Load the inputs, add input_offset.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
        acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const int16 input_val = *local_input_ptr++ + input_offset;
        const int16 filter_val = *local_filter_ptr++ + filter_offset;
        *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 16, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8[2];
    for (int i = 0; i < 2; i++) {
      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    }
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += input_ptr_increment;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
                                   vget_low_s16(filter[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
                                   vget_high_s16(filter[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 1, 16> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8[2];
    for (int i = 0; i < 2; i++) {
      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    }
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      uint8 input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      int16 input = static_cast<int16>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
        acc[2 * i + 1] =
            vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};
   1151 
   1152 template <>
   1153 struct QuantizedDepthwiseConvKernel<true, 1, 32> {
   1154   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
   1155                   const uint8* input_ptr, int16 input_offset,
   1156                   int input_ptr_increment, const uint8* filter_ptr,
   1157                   int16 filter_offset, int32* acc_buffer_ptr) {
   1158     // Load the filters, add filter_offset.
   1159     uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
   1160     uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
   1161     uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
   1162     uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
   1163     int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
   1164     int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
   1165     int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
   1166     int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
   1167     filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
   1168     filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
   1169     filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
   1170     filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
   1171     // Handle one output pixel at a time.
   1172     for (int outp = 0; outp < num_output_pixels; outp++) {
   1173       uint8 input_u8 = *input_ptr;
   1174       input_ptr += input_ptr_increment;
   1175       int16 input = static_cast<int16>(input_u8 + input_offset);
   1176       // Load the accumulators from acc_buffer
   1177       int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
   1178       int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
   1179       int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
   1180       int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
   1181       int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
   1182       int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
   1183       int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
   1184       int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
   1185       // Multiply-accumulate
   1186       acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
   1187       acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
   1188       acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
   1189       acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
   1190       acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
   1191       acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
   1192       acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
   1193       acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
   1194       // Store the accumulators back to acc_buffer
   1195       vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
   1196       vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
   1197       vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
   1198       vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
   1199       vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
   1200       vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
   1201       vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
   1202       vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
   1203       acc_buffer_ptr += 32;
   1204     }
   1205   }
   1206 };
   1207 
   1208 template <>
   1209 struct QuantizedDepthwiseConvKernel<true, 1, 20> {
   1210   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
   1211                   const uint8* input_ptr, int16 input_offset,
   1212                   int input_ptr_increment, const uint8* filter_ptr,
   1213                   int16 filter_offset, int32* acc_buffer_ptr) {
   1214     // Load the filters, add filter_offset.
   1215     // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
   1216     // We load the first 16 bytes into filter_u8_{0,1} as usual.
   1217     // Then we load the last 8 bytes into filter_u8_x (x for 'extra').
   1218     // This load is partly redundant: the first 4 bytes of filter_u8_x are
   1219     // the same as the last 4 bytes of filter_u8_1.
   1220     uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
   1221     uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
   1222     uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
   1223     int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
   1224     int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
   1225     int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
   1226     filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
   1227     filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
   1228     filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
   1229     // Handle one output pixel at a time.
   1230     for (int outp = 0; outp < num_output_pixels; outp++) {
   1231       uint8 input_u8 = *input_ptr;
   1232       input_ptr += input_ptr_increment;
   1233       int16 input = static_cast<int16>(input_u8 + input_offset);
   1234       // Load the accumulators from acc_buffer
   1235       int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
   1236       int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
   1237       int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
   1238       int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
   1239       int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
   1240       // Multiply-accumulate
   1241       acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
   1242       acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
   1243       acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
   1244       acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
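              // acc_4 covers channels 16-19, i.e. the high half of filter_x
              // (which was loaded from offset 12).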
   1245       acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
   1246       // Store the accumulators back to acc_buffer
   1247       vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
   1248       vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
   1249       vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
   1250       vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
   1251       vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
   1252       acc_buffer_ptr += 20;
   1253     }
   1254   }
   1255 };
   1256 
   1257 template <>
   1258 struct QuantizedDepthwiseConvKernel<true, 1, 8> {
   1259   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
   1260                   const uint8* input_ptr, int16 input_offset,
   1261                   int input_ptr_increment, const uint8* filter_ptr,
   1262                   int16 filter_offset, int32* acc_buffer_ptr) {
   1263     // Load the filters, add filter_offset.
   1264     const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
   1265     const int16x8_t filter = vaddq_s16(
   1266         vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
   1267     // Handle one output pixel at a time.
   1268     for (int outp = 0; outp < num_output_pixels; outp++) {
   1269       uint8 input_u8 = *input_ptr;
   1270       input_ptr += input_ptr_increment;
   1271       int16 input = static_cast<int16>(input_u8 + input_offset);
   1272       // Load the accumulators from acc_buffer
   1273       int32x4_t acc[2];
   1274       for (int i = 0; i < 2; i++) {
   1275         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
   1276       }
   1277       // Multiply-accumulate
   1278       acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
   1279       acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
   1280       // Store the accumulators back to acc_buffer
   1281       for (int i = 0; i < 2; i++) {
   1282         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
   1283       }
   1284       acc_buffer_ptr += 8;
   1285     }
   1286   }
   1287 };
   1288 
   1289 template <>
   1290 struct QuantizedDepthwiseConvKernel<true, 2, 1> {
   1291   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
   1292                   const uint8* input_ptr, int16 input_offset,
   1293                   int input_ptr_increment, const uint8* filter_ptr,
   1294                   int16 filter_offset, int32* acc_buffer_ptr) {
   1295     // Load the filters, add filter_offset.
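            // Duplicate the two filter values into lanes {f0, f1, f0, f1} so that a
            // single 4-lane multiply-accumulate covers two output pixels.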
   1296     uint8x8_t filter_u8 = vdup_n_u8(0);
   1297     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
   1298     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
   1299     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
   1300     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
   1301     const int16x4_t filter_s16 =
   1302         vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
   1303     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
   1304 
   1305     int outp = 0;
   1306 
   1307     // Handle 2 output pixels at a time.
   1308     for (; outp <= num_output_pixels - 2; outp += 2) {
   1309       // Load the accumulators from acc_buffer.
   1310       int32x4_t acc = vld1q_s32(acc_buffer_ptr);
   1311       // Load the inputs, add input_offset.
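              // With input_depth == 2, each pixel's two uint8 values are read as one
              // uint16, packing two pixels (four values) into a single 64-bit vector.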
   1312       uint16x4_t input_u16 = vdup_n_u16(0);
   1313       input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
   1314                                 input_u16, 0);
   1315       input_ptr += input_ptr_increment;
   1316       input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
   1317                                 input_u16, 1);
   1318       input_ptr += input_ptr_increment;
   1319       const int16x4_t input_s16 = vreinterpret_s16_u16(
   1320           vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
   1321       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
   1322 
   1323       // Multiply-accumulate.
   1324       acc = vmlal_s16(acc, filter, input);
   1325       // Store the accumulators back to acc_buffer.
   1326       vst1q_s32(acc_buffer_ptr, acc);
   1327       acc_buffer_ptr += 4;
   1328     }
   1329 
   1330     // Handle 1 output pixel at a time.
   1331     for (; outp < num_output_pixels; outp++) {
   1332       // Load the accumulators from acc_buffer.
   1333       int32x2_t acc = vld1_s32(acc_buffer_ptr);
   1334       // Load the inputs, add input_offset.
   1335       uint8x8_t input_u8 = vdup_n_u8(0);
   1336       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
   1337       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
   1338       input_ptr += input_ptr_increment;
   1339       const int16x4_t input_s16 =
   1340           vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
   1341       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
   1342 
   1343       // Multiply-accumulate.
   1344       acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
   1345       // Store the accumulators back to acc_buffer.
   1346       vst1_s32(acc_buffer_ptr, acc);
   1347       acc_buffer_ptr += 2;
   1348     }
   1349   }
   1350 };
   1351 
   1352 template <>
   1353 struct QuantizedDepthwiseConvKernel<true, 4, 1> {
   1354   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
   1355                   const uint8* input_ptr, int16 input_offset,
   1356                   int input_ptr_increment, const uint8* filter_ptr,
   1357                   int16 filter_offset, int32* acc_buffer_ptr) {
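            // Bail out early: the last output pixel is handled unconditionally after
            // the main loop below, so it must exist.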
   1358     if (num_output_pixels <= 0) {
   1359       return;
   1360     }
   1361 
   1362     // Load the filters, add filter_offset.
   1363     uint8x8_t filter_u8 = vdup_n_u8(0);
   1364     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
   1365     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
   1366     filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
   1367     filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
   1368     const int16x4_t filter_s16 =
   1369         vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
   1370     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
   1371 
   1372     int outp = 0;
   1373 
   1374     // Handle one output pixel at a time until the second-to-last pixel. We stop
   1375     // there because the 8-byte load below reads eight input values while only
   1376     // four are used, so on the last pixel it could read past the end of the input.
   1377     for (; outp < num_output_pixels - 1; outp++) {
   1378       // Load the accumulators from acc_buffer
   1379       int32x4_t acc;
   1380       acc = vld1q_s32(acc_buffer_ptr);
   1381 
   1382       // Load the inputs, add input_offset.
   1383       uint8x8_t input_u8 = vld1_u8(input_ptr);
   1384       input_ptr += input_ptr_increment;
   1385       const int16x4_t input_s16 =
   1386           vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
   1387       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
   1388       // Multiply-accumulate
   1389       acc = vmlal_s16(acc, filter, input);
   1390       // Store the accumulators back to acc_buffer
   1391       vst1q_s32(acc_buffer_ptr, acc);
   1392       acc_buffer_ptr += 4;
   1393     }
   1394 
   1395     // Handle the last output pixel.
   1396     // Load the accumulators from acc_buffer
   1397     int32x4_t acc;
   1398     acc = vld1q_s32(acc_buffer_ptr);
   1399 
   1400     // Load the inputs, add input_offset.
   1401     uint8x8_t input_u8 = vdup_n_u8(0);
   1402     input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
   1403     input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
   1404     input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
   1405     input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
   1406     const int16x4_t input_s16 =
   1407         vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
   1408     const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
   1409     // Multiply-accumulate
   1410     acc = vmlal_s16(acc, filter, input);
   1411     // Store the accumulators back to acc_buffer
   1412     vst1q_s32(acc_buffer_ptr, acc);
   1413   }
   1414 };
   1415 
   1416 template <>
   1417 struct QuantizedDepthwiseConvKernel<false, 12, 1> {
   1418   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
   1419                   const uint8* input_ptr, int16 input_offset,
   1420                   int input_ptr_increment, const uint8* filter_ptr,
   1421                   int16 filter_offset, int32* acc_buffer_ptr) {
   1422     // Load the filters, add filter_offset.
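            // 12 is not a multiple of 8, so the two 8-byte loads overlap by 4 bytes
            // (offsets 0 and 4); together they cover all 12 filter values.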
   1423     uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
   1424     uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
   1425     int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
   1426     int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
   1427     filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
   1428     filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
   1429     int16x4_t filter_0 = vget_low_s16(filter_s16_0);
   1430     int16x4_t filter_1 = vget_high_s16(filter_s16_0);
   1431     int16x4_t filter_2 = vget_high_s16(filter_s16_1);
   1432 
   1433     // Handle one output pixel at a time.
   1434     for (int outp = 0; outp < num_output_pixels; outp++) {
   1435       // Load the inputs, add input_offset.
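              // Same overlapping 8-byte loads as for the filters above: offsets 0 and 4
              // cover the 12 input values of this pixel.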
   1436       uint8x8_t input_u8_0 = vld1_u8(input_ptr);
   1437       uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
   1438       input_ptr += input_ptr_increment;
   1439       int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
   1440       int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
   1441       input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
   1442       input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
   1443 
   1444       // Load the accumulators from acc_buffer
   1445       int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
   1446       int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
   1447       int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
   1448 
   1449       // Multiply-accumulate
   1450       acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
   1451       acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
   1452       acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
   1453 
   1454       // Store the accumulators back to acc_buffer
   1455       vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
   1456       vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
   1457       vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
   1458 
   1459       acc_buffer_ptr += 12;
   1460     }
   1461   }
   1462 };
   1463 #endif
   1464 
   1465 // Accumulates the effect of one row of the filter on a segment of one row
   1466 // of the output, accessing the corresponding row of the input.
   1467 template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
   1468 void QuantizedDepthwiseConvAccumRow(
   1469     int stride, int input_depth, int input_width, const uint8* input_data,
   1470     int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
   1471     const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
   1472     int out_x_buffer_end, int output_depth, int32* acc_buffer) {
   1473 #ifdef GEMMLOWP_PROFILING
   1474   gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
   1475 #endif
   1476   // Sanity check parameters. This is important in particular to ensure
   1477   // that we keep the number of template instantiations minimal, so we don't
   1478   // increase binary size unnecessarily.
   1479   static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
   1480   static_assert(kFixedInputDepth || kAllowStrided, "");
   1481   TFLITE_DCHECK(stride == 1 || kAllowStrided);
   1482   if (kFixedInputDepth) {
   1483     TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
   1484   }
   1485   if (kFixedDepthMultiplier) {
   1486     TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
   1487   }
   1488   TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
   1489   const int input_ptr_increment = stride * input_depth;
   1490   const uint8* filter_base_ptr = filter_data;
   1491   for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
   1492     // For the current (filter_x, filter_y) point in the filter,
   1493     // compute the boundaries of the corresponding output row segment.
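            // The input x position is in_x = out_x * stride - pad_width + filter_x;
            // requiring 0 <= in_x < input_width gives the unclamped [start, end) bounds
            // computed below (the start uses rounding-up division by the stride).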
   1494     int out_x_loop_start_unclamped = 0;
   1495     int out_x_loop_end_unclamped = 0;
   1496     if (kAllowStrided) {
   1497       if (stride == 2) {
   1498         out_x_loop_start_unclamped = (pad_width - filter_x + 1) / 2;
   1499         out_x_loop_end_unclamped =
   1500             (pad_width + input_width - filter_x + 1) / 2;
   1501       } else if (stride == 4) {
   1502         out_x_loop_start_unclamped = (pad_width - filter_x + 3) / 4;
   1503         out_x_loop_end_unclamped =
   1504             (pad_width + input_width - filter_x + 3) / 4;
   1505       } else {
   1506         out_x_loop_start_unclamped =
   1507             (pad_width - filter_x + stride - 1) / stride;
   1508         out_x_loop_end_unclamped =
   1509             (pad_width + input_width - filter_x + stride - 1) / stride;
   1510       }
   1511     } else {
   1512       out_x_loop_start_unclamped = pad_width - filter_x;
   1513       out_x_loop_end_unclamped = pad_width + input_width - filter_x;
   1514     }
   1515     // The kernel will have to iterate on the segment of the
   1516     // output row that starts at out_x_loop_start and ends at out_x_loop_end.
   1517     const int out_x_loop_start =
   1518         std::max(out_x_buffer_start, out_x_loop_start_unclamped);
   1519     const int out_x_loop_end =
   1520         std::min(out_x_buffer_end, out_x_loop_end_unclamped);
   1521 
   1522     int32* acc_buffer_ptr =
   1523         acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
   1524     const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
   1525     const uint8* input_ptr = input_data + in_x_origin * input_depth;
   1526     const int num_output_pixels = out_x_loop_end - out_x_loop_start;
   1527     QuantizedDepthwiseConvKernel<
   1528         kAllowStrided, kFixedInputDepth,
   1529         kFixedDepthMultiplier>::Run(num_output_pixels, input_depth,
   1530                                     depth_multiplier, input_ptr, input_offset,
   1531                                     input_ptr_increment, filter_base_ptr,
   1532                                     filter_offset, acc_buffer_ptr);
   1533     filter_base_ptr += output_depth;
   1534   }
   1535 }
   1536 
   1537 // Generic fallback of QuantizedDepthwiseConvAccumRow, portable, non-templatized.
   1538 inline void QuantizedDepthwiseConvAccumRowGeneric(
   1539     int stride, int input_depth, int input_width, const uint8* input_data,
   1540     int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
   1541     const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
   1542     int out_x_buffer_end, int output_depth, int32* acc_buffer) {
   1543   gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
   1544 #ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
   1545 #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
   1546   LOG(FATAL)
   1547       << "\n\n"
   1548       << "*****************************************************************\n"
   1549       << "* This tfmini inference code was about to use the slow generic\n"
   1550       << "* fallback implementation for a DepthwiseConv op, and we want you\n"
   1551       << "* to be aware of that so that you will know why you get terrible\n"
   1552       << "* performance.\n"
   1553       << "*\n"
   1554       << "* If you would like to carry on with the slow code, compile\n"
   1555       << "* with this preprocessor token defined:\n"
   1556       << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
   1557       << "*\n"
   1558       << "* The right thing to do, if you care about performance, is to add\n"
   1559       << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
   1560       << "* The relevant parameters defining your case are:\n"
   1561       << "* stride = " << stride << "\n"
   1562       << "* input_depth = " << input_depth << "\n"
   1563       << "* depth_multiplier = " << depth_multiplier << "\n"
   1564       << "*\n"
   1565       << "* Please do not hesitate to contact benoitjacob@ with this\n"
   1566       << "* information.\n"
   1567       << "*****************************************************************\n";
   1568 #endif  // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
   1569 #endif  // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
   1570   const uint8* filter_base_ptr = filter_data;
   1571   for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
   1572     const int out_x_loop_start = std::max(
   1573         out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
   1574     const int out_x_loop_end =
   1575         std::min(out_x_buffer_end,
   1576                  (pad_width + input_width - filter_x + stride - 1) / stride);
   1577 
   1578     int32* acc_buffer_ptr =
   1579         acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
   1580     const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
   1581     const uint8* input_ptr = input_data + in_x_origin * input_depth;
   1582     const int input_ptr_increment = (stride - 1) * input_depth;
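            // (stride - 1) rather than stride: the inner channel loop below already
            // advances input_ptr by input_depth for each output pixel.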
   1583     for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
   1584       const uint8* filter_ptr = filter_base_ptr;
   1585       for (int ic = 0; ic < input_depth; ++ic) {
   1586         const int16 input_val = *input_ptr++ + input_offset;
   1587         for (int m = 0; m < depth_multiplier; m++) {
   1588           const int16 filter_val = *filter_ptr++ + filter_offset;
   1589           *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
   1590         }
   1591       }
   1592       input_ptr += input_ptr_increment;
   1593     }
   1594     filter_base_ptr += output_depth;
   1595   }
   1596 }
   1597 
   1598 // Initializes the accumulator buffer with bias values.
   1599 inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
   1600                                        const int32* bias_data,
   1601                                        int32* acc_buffer) {
   1602   int i = 0;
   1603 #ifdef USE_NEON
   1604   if (output_depth == 1) {
   1605     const int32x4_t b = vdupq_n_s32(bias_data[0]);
   1606     for (; i <= num_output_pixels - 16; i += 16) {
   1607       vst1q_s32(acc_buffer + i + 0, b);
   1608       vst1q_s32(acc_buffer + i + 4, b);
   1609       vst1q_s32(acc_buffer + i + 8, b);
   1610       vst1q_s32(acc_buffer + i + 12, b);
   1611     }
   1612     for (; i <= num_output_pixels - 4; i += 4) {
   1613       vst1q_s32(acc_buffer + i, b);
   1614     }
   1615   } else if (output_depth == 2) {
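            // Build {b0, b1, b0, b1} so that each 128-bit store below initializes two
            // output pixels at once.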
   1616     int32x4_t b = vdupq_n_s32(bias_data[0]);
   1617     b = vsetq_lane_s32(bias_data[1], b, 1);
   1618     b = vsetq_lane_s32(bias_data[1], b, 3);
   1619     for (; i <= num_output_pixels - 8; i += 8) {
   1620       vst1q_s32(acc_buffer + 2 * i + 0, b);
   1621       vst1q_s32(acc_buffer + 2 * i + 4, b);
   1622       vst1q_s32(acc_buffer + 2 * i + 8, b);
   1623       vst1q_s32(acc_buffer + 2 * i + 12, b);
   1624     }
   1625     for (; i <= num_output_pixels - 2; i += 2) {
   1626       vst1q_s32(acc_buffer + 2 * i, b);
   1627     }
   1628   } else if (output_depth == 4) {
   1629     const int32x4_t b = vld1q_s32(bias_data);
   1630     for (; i <= num_output_pixels - 4; i += 4) {
   1631       vst1q_s32(acc_buffer + 4 * i + 0, b);
   1632       vst1q_s32(acc_buffer + 4 * i + 4, b);
   1633       vst1q_s32(acc_buffer + 4 * i + 8, b);
   1634       vst1q_s32(acc_buffer + 4 * i + 12, b);
   1635     }
   1636     for (; i < num_output_pixels; i++) {
   1637       vst1q_s32(acc_buffer + 4 * i, b);
   1638     }
   1639   } else if (output_depth == 8) {
   1640     const int32x4_t b0 = vld1q_s32(bias_data);
   1641     const int32x4_t b1 = vld1q_s32(bias_data + 4);
   1642     for (; i <= num_output_pixels - 2; i += 2) {
   1643       vst1q_s32(acc_buffer + 8 * i + 0, b0);
   1644       vst1q_s32(acc_buffer + 8 * i + 4, b1);
   1645       vst1q_s32(acc_buffer + 8 * i + 8, b0);
   1646       vst1q_s32(acc_buffer + 8 * i + 12, b1);
   1647     }
   1648     for (; i < num_output_pixels; i++) {
   1649       vst1q_s32(acc_buffer + 8 * i + 0, b0);
   1650       vst1q_s32(acc_buffer + 8 * i + 4, b1);
   1651     }
   1652   } else if (output_depth == 16) {
   1653     const int32x4_t b0 = vld1q_s32(bias_data);
   1654     const int32x4_t b1 = vld1q_s32(bias_data + 4);
   1655     const int32x4_t b2 = vld1q_s32(bias_data + 8);
   1656     const int32x4_t b3 = vld1q_s32(bias_data + 12);
   1657     for (; i < num_output_pixels; i++) {
   1658       vst1q_s32(acc_buffer + 16 * i + 0, b0);
   1659       vst1q_s32(acc_buffer + 16 * i + 4, b1);
   1660       vst1q_s32(acc_buffer + 16 * i + 8, b2);
   1661       vst1q_s32(acc_buffer + 16 * i + 12, b3);
   1662     }
   1663   }
   1664 #endif
   1665   for (; i < num_output_pixels; i++) {
   1666     memcpy(acc_buffer + i * output_depth, bias_data,
   1667            sizeof(acc_buffer[0]) * output_depth);
   1668   }
   1669 }
   1670 
   1671 inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
   1672                           int32 input_offset, const uint8* filter_data,
   1673                           const Dims<4>& filter_dims, int32 filter_offset,
   1674                           const int32* bias_data, const Dims<4>& bias_dims,
   1675                           int stride_width, int stride_height, int pad_width,
   1676                           int pad_height, int depth_multiplier,
   1677                           int32 output_offset, int32 output_multiplier,
   1678                           int output_shift, int32 output_activation_min,
   1679                           int32 output_activation_max, uint8* output_data,
   1680                           const Dims<4>& output_dims) {
   1681   gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
   1682   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
   1683 
   1684   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   1685   const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
   1686   const int input_height = ArraySize(input_dims, 2);
   1687   const int input_width = ArraySize(input_dims, 1);
   1688   const int input_depth = ArraySize(input_dims, 0);
   1689   const int filter_height = ArraySize(filter_dims, 2);
   1690   const int filter_width = ArraySize(filter_dims, 1);
   1691   const int output_height = ArraySize(output_dims, 2);
   1692   const int output_width = ArraySize(output_dims, 1);
   1693   TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
   1694 
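          // Accumulation happens in a fixed-size stack buffer of int32 accumulators
          // covering a horizontal strip of output pixels (all channels of each pixel).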
   1695   static const int kAccBufferMaxSize = 2048;
   1696   int32 acc_buffer[kAccBufferMaxSize];
   1697   TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
   1698   const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
   1699   const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
   1700   TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
   1701                    kAccBufferActualSize);
   1702   TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
   1703   TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
   1704 
   1705   // row_accum_func will point to the core accumulation function to be used
   1706   // for this DepthwiseConv op.
   1707   using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
   1708   row_accum_func_t row_accum_func = nullptr;
   1709 
   1710 #define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
   1711                                         FIXED_DEPTH_MULTIPLIER)           \
   1712   if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
   1713       (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
   1714       depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
   1715     row_accum_func =                                                      \
   1716         QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,  \
   1717                                        FIXED_DEPTH_MULTIPLIER>;           \
   1718   }
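          // Each use of the macro below selects its kernel only if no earlier (more
          // preferred) one already matched; FIXED_INPUT_DEPTH == 0 matches any input depth.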
   1719 
   1720 #ifdef USE_NEON
   1721   // We go over our list of kernels in decreasing order of preference
   1722   // for the cases where multiple kernels could apply.
   1723 
   1724   // Start with the fastest kernels: AllowStrided=false, fixed input depth.
   1725 
   1726   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
   1727   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
   1728   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
   1729   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
   1730   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
   1731   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
   1732   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
   1733   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
   1734   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
   1735   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)
   1736 
   1737   // Next come the strided kernels: AllowStrided=true, fixed input depth.
   1738   // They are a bit less efficient, but allow stride!=1.
   1739 
   1740   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
   1741   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
   1742   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
   1743   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
   1744   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
   1745   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
   1746   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
   1747   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
   1748   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
   1749 
   1750   // Finally, the kernels allowing a variable input depth;
   1751   // these are the least efficient but most general kernels.
   1752 
   1753   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
   1754   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
   1755   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
   1756 #endif  // USE_NEON
   1757 
   1758   // No matching fast kernel found, use slow fallback.
   1759   if (!row_accum_func) {
   1760     row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
   1761   }
   1762 
   1763 #undef TFMINI_USE_DEPTHWISECONV_KERNEL
   1764 
   1765   // Now that we have determined row_accum_func, we can start work.
   1766   uint8* output_ptr = output_data;
   1767   for (int b = 0; b < batches; ++b) {
   1768     for (int out_y = 0; out_y < output_height; ++out_y) {
   1769       const int in_y_origin = (out_y * stride_height) - pad_height;
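              // Restrict to the filter rows that overlap the input image vertically;
              // rows falling entirely in the padding are skipped.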
   1770       const int filter_y_start = std::max(0, -in_y_origin);
   1771       const int filter_y_end =
   1772           std::min(filter_height, input_height - in_y_origin);
   1773       for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
   1774            out_x_buffer_start += kOutputPixelsInAccBuffer) {
   1775         const int out_x_buffer_end = std::min(
   1776             output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
   1777         // We call a 'pixel' a group of activations that share all but the
   1778         // 'depth'/'channel' coordinate. num_output_pixels is the number of
   1779         // output pixels that we will accumulate in this loop iteration.
   1780         const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
   1781         // Initialize our local accumulator with the bias values, so we don't
   1782         // have to add them later.
   1783         DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
   1784                                    acc_buffer);
   1785         // Accumulation loop. Most of the time should be spent in here.
   1786         for (int filter_y = filter_y_start; filter_y < filter_y_end;
   1787              ++filter_y) {
   1788           const int in_y = in_y_origin + filter_y;
   1789           row_accum_func(
   1790               stride_width, input_depth, input_width,
   1791               input_data + in_y * input_dims.strides[2] +
   1792                   b * input_dims.strides[3],
   1793               input_offset, pad_width, depth_multiplier, filter_width,
   1794               filter_data + filter_y * filter_dims.strides[2], filter_offset,
   1795               out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
   1796         }
   1797         // Finished accumulating int32 values. Now need to convert them to
   1798         // the final 8bit form and store them.
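                // Requantization: fixed-point multiply by output_multiplier (saturating
                // rounding doubling high multiply), rounding right shift by output_shift,
                // add output_offset, then clamp to the activation range.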
   1799         gemmlowp::ScopedProfilingLabel label("downquantize+store");
   1800         const int num_output_values = output_depth * num_output_pixels;
   1801         int i = 0;
   1802 #ifdef USE_NEON
   1803         using gemmlowp::RoundingDivideByPOT;
   1804         const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
   1805         const int32x4_t output_activation_min_vec =
   1806             vdupq_n_s32(output_activation_min);
   1807         const int32x4_t output_activation_max_vec =
   1808             vdupq_n_s32(output_activation_max);
   1809         // Handle 16 values at once.
   1810         // This allows us to issue 4 mutually independent int32
   1811         // multiplications (vqrdmulh), which should alleviate most of their
   1812         // high latency.
   1813         for (; i <= num_output_values - 16; i += 16) {
   1814           int32x4_t acc[4];
   1815           for (int j = 0; j < 4; j++) {
   1816             acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
   1817           }
   1818 
   1819           // Fixed-point multiplication.
   1820           for (int j = 0; j < 4; j++) {
   1821             acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
   1822           }
   1823           for (int j = 0; j < 4; j++) {
   1824             acc[j] = RoundingDivideByPOT(acc[j], output_shift);
   1825           }
   1826           // Add the output offset.
   1827           for (int j = 0; j < 4; j++) {
   1828             acc[j] = vaddq_s32(acc[j], output_offset_vec);
   1829           }
   1830           // Apply the activation function.
   1831           for (int j = 0; j < 4; j++) {
   1832             acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
   1833           }
   1834           for (int j = 0; j < 4; j++) {
   1835             acc[j] = vminq_s32(acc[j], output_activation_max_vec);
   1836           }
   1837           // Saturating cast to uint8 and store to destination.
   1838           int16x4_t acc_s16[4];
   1839           for (int j = 0; j < 4; j++) {
   1840             acc_s16[j] = vqmovn_s32(acc[j]);
   1841           }
   1842           const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
   1843           const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
   1844           const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
   1845           const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
   1846           vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
   1847           output_ptr += 16;
   1848         }
   1849         // Handle 8 values at once.
   1850         // Not as good as 16 (now we're only issuing 2 mutually independent
   1851         // vqrdmulh instructions, so we're probably paying for their high
   1852         // latency).
   1853         for (; i <= num_output_values - 8; i += 8) {
   1854           int32x4_t acc0 = vld1q_s32(acc_buffer + i);
   1855           int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
   1856           // Fixed-point multiplication.
   1857           acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
   1858           acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
   1859           // Rounding right shift.
   1860           acc0 = RoundingDivideByPOT(acc0, output_shift);
   1861           acc1 = RoundingDivideByPOT(acc1, output_shift);
   1862           // Add the output offset.
   1863           acc0 = vaddq_s32(acc0, output_offset_vec);
   1864           acc1 = vaddq_s32(acc1, output_offset_vec);
   1865           // Apply the activation function.
   1866           acc0 = vmaxq_s32(acc0, output_activation_min_vec);
   1867           acc1 = vmaxq_s32(acc1, output_activation_min_vec);
   1868           acc0 = vminq_s32(acc0, output_activation_max_vec);
   1869           acc1 = vminq_s32(acc1, output_activation_max_vec);
   1870           // Saturating cast to uint8 and store to destination.
   1871           const int16x4_t acc0_s16 = vqmovn_s32(acc0);
   1872           const int16x4_t acc1_s16 = vqmovn_s32(acc1);
   1873           const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
   1874           const uint8x8_t res_u8 = vqmovun_s16(res_s16);
   1875           vst1_u8(output_ptr, res_u8);
   1876           output_ptr += 8;
   1877         }
   1878         // Handle 4 values at once. Now we're paying the full price of the
   1879         // high latency of vqrdmulh. Also, storing only 4 bytes at the end
   1880         // (without any alignment) can only be done 1 byte at a time.
   1881         // Yet, that is still worth doing to minimize the amount of leftover
   1882         // that will have to go through the very slow scalar code.
   1883         for (; i <= num_output_values - 4; i += 4) {
   1884           int32x4_t acc = vld1q_s32(acc_buffer + i);
   1885           // Fixed-point multiplication.
   1886           acc = vqrdmulhq_n_s32(acc, output_multiplier);
   1887           // Rounding right shift.
   1888           acc = RoundingDivideByPOT(acc, output_shift);
   1889           // Add the output offset.
   1890           acc = vaddq_s32(acc, output_offset_vec);
   1891           // Apply the activation function.
   1892           acc = vmaxq_s32(acc, output_activation_min_vec);
   1893           acc = vminq_s32(acc, output_activation_max_vec);
   1894           // Saturating cast to uint8 and store to destination.
   1895           const int16x4_t acc_s16 = vqmovn_s32(acc);
   1896           const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
   1897           const uint8x8_t res_u8 = vqmovun_s16(res_s16);
   1898           vst1_lane_u8(output_ptr + 0, res_u8, 0);
   1899           vst1_lane_u8(output_ptr + 1, res_u8, 1);
   1900           vst1_lane_u8(output_ptr + 2, res_u8, 2);
   1901           vst1_lane_u8(output_ptr + 3, res_u8, 3);
   1902           output_ptr += 4;
   1903         }
   1904 #endif  // USE_NEON
   1905 
   1906         // Handle leftover values, one by one. This is very slow.
   1907         for (; i < num_output_values; i++) {
   1908           int32 acc = acc_buffer[i];
   1909           acc = MultiplyByQuantizedMultiplierSmallerThanOne(
   1910               acc, output_multiplier, output_shift);
   1911           acc += output_offset;
   1912           acc = std::max(acc, output_activation_min);
   1913           acc = std::min(acc, output_activation_max);
   1914           *output_ptr++ = static_cast<uint8>(acc);
   1915         }
   1916       }
   1917     }
   1918   }
   1919 }
   1920 
   1921 // Legacy, for compatibility with old checked-in code.
   1922 template <FusedActivationFunctionType Ac>
   1923 void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
   1924                    int32 input_offset, const uint8* filter_data,
   1925                    const Dims<4>& filter_dims, int32 filter_offset,
   1926                    const int32* bias_data, const Dims<4>& bias_dims,
   1927                    int stride_width, int stride_height, int pad_width,
   1928                    int pad_height, int depth_multiplier, int32 output_offset,
   1929                    int32 output_multiplier, int output_shift,
   1930                    int32 output_activation_min, int32 output_activation_max,
   1931                    uint8* output_data, const Dims<4>& output_dims) {
   1932   if (Ac == FusedActivationFunctionType::kNone) {
   1933     TFLITE_DCHECK_EQ(output_activation_min, 0);
   1934     TFLITE_DCHECK_EQ(output_activation_max, 255);
   1935   }
   1936   DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
   1937                 filter_offset, bias_data, bias_dims, stride_width,
   1938                 stride_height, pad_width, pad_height, depth_multiplier,
   1939                 output_offset, output_multiplier, output_shift,
   1940                 output_activation_min, output_activation_max, output_data,
   1941                 output_dims);
   1942 }
   1943 
   1944 // Legacy, for compatibility with old checked-in code.
   1945 template <FusedActivationFunctionType Ac>
   1946 void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
   1947                    int32 input_offset, const uint8* filter_data,
   1948                    const Dims<4>& filter_dims, int32 filter_offset,
   1949                    const int32* bias_data, const Dims<4>& bias_dims, int stride,
   1950                    int pad_width, int pad_height, int depth_multiplier,
   1951                    int32 output_offset, int32 output_multiplier,
   1952                    int output_shift, int32 output_activation_min,
   1953                    int32 output_activation_max, uint8* output_data,
   1954                    const Dims<4>& output_dims) {
   1955   DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data,
   1956                     filter_dims, filter_offset, bias_data, bias_dims, stride,
   1957                     stride, pad_width, pad_height, depth_multiplier,
   1958                     output_offset, output_multiplier, output_shift,
   1959                     output_activation_min, output_activation_max, output_data,
   1960                     output_dims);
   1961 }
   1962 
   1963 }  // namespace optimized_ops
   1964 }  // namespace tflite
   1965 
   1966 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
   1967