/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
#define ANDROID_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_

#include "fixedpoint.h"
#include "gemmlowp.h"
#include "../common.h"
#include "../types.h"

namespace android {
namespace nn {
namespace optimized_ops {

// Implementation of quantized DepthwiseConv

template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct QuantizedDepthwiseConvKernel {};

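// Each specialization below implements the same Run() contract: one call
// accumulates a single filter tap over num_output_pixels consecutive output
// pixels of one output row.
//  - kAllowStrided: whether input_ptr may advance by an arbitrary
//    input_ptr_increment between output pixels (needed when stride > 1).
//  - kFixedInputDepth: compile-time input depth, or 0 to handle any depth at
//    run time.
//  - kFixedDepthMultiplier: compile-time depth multiplier.
// acc_buffer_ptr holds input_depth * depth_multiplier int32 accumulators per
// output pixel, with the depth_multiplier outputs of each input channel laid
// out contiguously.
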
#ifdef USE_NEON
template <>
struct QuantizedDepthwiseConvKernel<true, 8, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8x2_t filter_u8;
    filter_u8.val[0] = vld1_u8(filter_ptr);
    filter_u8.val[1] = vld1_u8(filter_ptr + 8);
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
                            vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4x2_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
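      // (With depth_multiplier == 2, each input channel feeds two adjacent
      // accumulators; zipping the vector with itself yields the interleaved
      // [i0, i0, i1, i1, ...] layout that matches the filter layout.)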
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
                                  vget_low_s16(input_dup2.val[i]));
        acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
                                  vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

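// All specializations follow the same quantized arithmetic: widen the uint8
// values to int16 with vmovl_u8, add the zero-point offsets once while still
// in 16 bits, then use the widening multiply-accumulate vmlal_s16 so that
// each product (input + input_offset) * (filter + filter_offset) is
// accumulated at 32-bit precision.
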
template <>
struct QuantizedDepthwiseConvKernel<false, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
      acc[1] =
          vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
      acc[3] =
          vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      acc[0] = vld1q_s32(acc_buffer_ptr);
      acc[1] = vld1q_s32(acc_buffer_ptr + 4);

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc[0]);
      vst1q_s32(acc_buffer_ptr + 4, acc[1]);
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
                                   vget_low_s16(input_dup2.val[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
                                   vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4x2_t input_dup2 = vzip_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }
    int outp = 0;
    // Handle two output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);

      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
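    // The depth-2 filter is replicated into lanes 2-3 so that one 4-lane
    // multiply-accumulate below covers two output pixels at a time.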
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++) {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++) {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 2; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x2_t acc = vld1_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32 input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 1, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32 input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vmlal_n_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int16x8_t input[2];
      for (int i = 0; i < 2; i++) {
        const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      }
      input_ptr += 16;
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[2 * i + 0] =
            vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
        acc[2 * i + 1] =
            vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc;
      acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<false, 4, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++) {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
                              vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
                              vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
                              vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
                              vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
                              vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
                              vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
                              vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
                              vget_high_s16(input), 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 =
          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};

template <>
struct QuantizedDepthwiseConvKernel<true, 0, 3> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8* input_ptr, int16 input_offset,
                  int input_ptr_increment, const uint8* filter_ptr,
                  int16 filter_offset, int32* acc_buffer_ptr) {
    // We will have to duplicate bytes in a NEON register, 3-fold.
    // We will do that by register-level table-look-up using VTBL instructions.
    // Here we prepare the registers containing the table-lookup indices.
    static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
                                                   {2, 3, 3, 3, 4, 4, 4, 5},
                                                   {5, 5, 6, 6, 6, 7, 7, 7}};
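    // Looking up the 8 loaded input bytes [b0..b7] through these three index
    // vectors produces 24 bytes [b0,b0,b0, b1,b1,b1, ..., b7,b7,b7], i.e.
    // each input channel repeated depth_multiplier == 3 times, matching the
    // layout of the filter values.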
    799     uint8x8_t dup3_indices[3];
    800     for (int i = 0; i < 3; i++) {
    801       dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
    802     }
    803 
    804     // Handle one output pixel at a time.
    805     for (int outp = 0; outp < num_output_pixels; outp++) {
    806       const uint8* local_filter_ptr = filter_ptr;
    807       const uint8* local_input_ptr = input_ptr;
    808       int ic = 0;
    809       // Handle 8 input channels at a time.
    810       for (; ic <= input_depth - 8; ic += 8) {
    811         // Load the filters, add filter_offset.
    812         int16x8_t filter[3];
    813         uint8x8x3_t filter_u8;
    814         filter_u8.val[0] = vld1_u8(local_filter_ptr);
    815         filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
    816         filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
    817         local_filter_ptr += 24;
    818         for (int i = 0; i < 3; i++) {
    819           const int16x8_t filter_s16 =
    820               vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
    821           filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    822         }
    823         // Load the inputs, duplicate 3-fold, add input_offset.
    824         const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
    825         local_input_ptr += 8;
    826 
    827         uint8x8_t input_u8_dup3[3];
    828         for (int i = 0; i < 3; i++) {
    829           input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
    830         }
    831         int16x8_t input_dup3[3];
    832         for (int i = 0; i < 3; i++) {
    833           const int16x8_t input_s16_dup3 =
    834               vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
    835           input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
    836         }
    837         // Load the accumulators from acc_buffer
    838         int32x4x3_t acc[2];
    839         for (int i = 0; i < 2; i++) {
    840           acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
    841           acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
    842           acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
    843         }
    844         // Multiply-accumulate
    845         for (int j = 0; j < 3; j++) {
    846           acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
    847                                     vget_low_s16(filter[j]));
    848           acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
    849                                     vget_high_s16(filter[j]));
    850         }
    851         // Store the accumulators back to acc_buffer
    852         for (int i = 0; i < 2; i++) {
    853           vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
    854           vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
    855           vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
    856         }
    857         acc_buffer_ptr += 24;
    858       }
    859       // Handle one input channel at a time.
    860       for (; ic < input_depth; ic++) {
    861         const int16 input_val = *local_input_ptr++ + input_offset;
    862         for (int i = 0; i < 3; i++) {
    863           const int16 filter_val = local_filter_ptr[i] + filter_offset;
    864           *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
    865         }
    866         local_filter_ptr += 3;
    867       }
    868       input_ptr += input_ptr_increment;
    869     }
    870   }
    871 };
    872 
    873 template <>
    874 struct QuantizedDepthwiseConvKernel<true, 0, 2> {
    875   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
    876                   const uint8* input_ptr, int16 input_offset,
    877                   int input_ptr_increment, const uint8* filter_ptr,
    878                   int16 filter_offset, int32* acc_buffer_ptr) {
    879     // Handle one output pixel at a time.
    880     for (int outp = 0; outp < num_output_pixels; outp++) {
    881       const uint8* local_filter_ptr = filter_ptr;
    882       const uint8* local_input_ptr = input_ptr;
    883       int ic = 0;
    884       // Handle 8 input channels at a time.
    885       for (; ic <= input_depth - 8; ic += 8) {
    886         // Load the filters, add filter_offset.
    887         int16x8_t filter[2];
    888         uint8x8x2_t filter_u8;
    889         filter_u8.val[0] = vld1_u8(local_filter_ptr);
    890         filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
    891         local_filter_ptr += 16;
    892         for (int i = 0; i < 2; i++) {
    893           const int16x8_t filter_s16 =
    894               vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
    895           filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    896         }
    897         // Load the inputs, add input_offset, duplicate 2-fold.
    898         const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
    899         local_input_ptr += 8;
    900         const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
    901         const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
    902         const int16x8x2_t input_dup2 = vzipq_s16(input, input);
    903         // Load the accumulators from acc_buffer.
    904         int32x4x2_t acc[2];
    905         for (int i = 0; i < 2; i++) {
    906           acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
    907           acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
    908         }
    909         // Multiply-accumulate.
    910         for (int j = 0; j < 2; j++) {
    911           acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
    912                                     vget_low_s16(input_dup2.val[j]));
    913           acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
    914                                     vget_high_s16(input_dup2.val[j]));
    915         }
    916         // Store the accumulators back to acc_buffer.
    917         for (int i = 0; i < 2; i++) {
    918           vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
    919           vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
    920         }
    921         acc_buffer_ptr += 16;
    922       }
    923       // Handle one input channel at a time.
    924       for (; ic < input_depth; ic++) {
    925         // Load the inputs.
    926         const int16 input_val = *local_input_ptr++ + input_offset;
    927         for (int i = 0; i < 2; i++) {
    928           const int16 filter_val = local_filter_ptr[i] + filter_offset;
    929           *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
    930         }
    931         local_filter_ptr += 2;
    932       }
    933       input_ptr += input_ptr_increment;
    934     }
    935   }
    936 };
    937 
    938 template <>
    939 struct QuantizedDepthwiseConvKernel<true, 0, 1> {
    940   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
    941                   const uint8* input_ptr, int16 input_offset,
    942                   int input_ptr_increment, const uint8* filter_ptr,
    943                   int16 filter_offset, int32* acc_buffer_ptr) {
    944     // Handle one output pixel at a time.
    945     for (int outp = 0; outp < num_output_pixels; outp++) {
    946       const uint8* local_filter_ptr = filter_ptr;
    947       const uint8* local_input_ptr = input_ptr;
    948       int ic = 0;
    949       // Handle 16 input channels at a time.
    950       for (; ic <= input_depth - 16; ic += 16) {
    951         // Load the filters, add filter_offset.
    952         uint8x8_t filter_u8[2];
    953         for (int i = 0; i < 2; i++) {
    954           filter_u8[i] = vld1_u8(local_filter_ptr + 8 * i);
    955         }
    956         local_filter_ptr += 16;
    957         int16x8_t filter[2];
    958         for (int i = 0; i < 2; i++) {
    959           filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    960         }
    961         for (int i = 0; i < 2; i++) {
    962           filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    963         }
    964         // Load the inputs, add input_offset.
    965         uint8x8_t input_u8[2];
    966         for (int i = 0; i < 2; i++) {
    967           input_u8[i] = vld1_u8(local_input_ptr + 8 * i);
    968         }
    969         local_input_ptr += 16;
    970         int16x8_t input[2];
    971         for (int i = 0; i < 2; i++) {
    972           input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
    973         }
    974         for (int i = 0; i < 2; i++) {
    975           input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
    976         }
    977         // Load the accumulators from acc_buffer
    978         int32x4_t acc[4];
    979         for (int i = 0; i < 4; i++) {
    980           acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
    981         }
    982         // Multiply-accumulate
    983         for (int i = 0; i < 2; i++) {
    984           acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
    985                                      vget_low_s16(filter[i]));
    986           acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
    987                                      vget_high_s16(filter[i]));
    988         }
    989         // Store the accumulators back to acc_buffer
    990         for (int i = 0; i < 4; i++) {
    991           vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
    992         }
    993         acc_buffer_ptr += 16;
    994       }
    995       // Handle 8 input channels at a time.
    996       for (; ic <= input_depth - 8; ic += 8) {
    997         // Load the filters, add filter_offset.
    998         const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
    999         local_filter_ptr += 8;
   1000         const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
   1001         const int16x8_t filter =
   1002             vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
   1003         // Load the inputs, add input_offset.
   1004         const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
   1005         local_input_ptr += 8;
   1006         const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
   1007         const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
   1008         // Load the accumulators from acc_buffer
   1009         int32x4_t acc[2];
   1010         for (int i = 0; i < 2; i++) {
   1011           acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
   1012         }
   1013         // Multiply-accumulate
   1014         acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
   1015         acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
   1016         // Store the accumulators back to acc_buffer
   1017         for (int i = 0; i < 2; i++) {
   1018           vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
   1019         }
   1020         acc_buffer_ptr += 8;
   1021       }
   1022       // Handle one input channel at a time.
   1023       for (; ic < input_depth; ic++) {
   1024         const int16 input_val = *local_input_ptr++ + input_offset;
   1025         const int16 filter_val = *local_filter_ptr++ + filter_offset;
   1026         *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
   1027       }
   1028       input_ptr += input_ptr_increment;
   1029     }
   1030   }
   1031 };
   1032 
   1033 template <>
   1034 struct QuantizedDepthwiseConvKernel<true, 16, 1> {
   1035   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
   1036                   const uint8* input_ptr, int16 input_offset,
   1037                   int input_ptr_increment, const uint8* filter_ptr,
   1038                   int16 filter_offset, int32* acc_buffer_ptr) {
   1039     // Load the filters, add filter_offset.
   1040     uint8x8_t filter_u8[2];
   1041     for (int i = 0; i < 2; i++) {
   1042       filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
   1043     }
   1044     int16x8_t filter[2];
   1045     for (int i = 0; i < 2; i++) {
   1046       filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
   1047     }
   1048     for (int i = 0; i < 2; i++) {
   1049       filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
   1050     }
   1051     // Handle one output pixel at a time.
   1052     for (int outp = 0; outp < num_output_pixels; outp++) {
   1053       // Load the inputs, add input_offset.
   1054       uint8x8_t input_u8[2];
   1055       for (int i = 0; i < 2; i++) {
   1056         input_u8[i] = vld1_u8(input_ptr + 8 * i);
   1057       }
   1058       input_ptr += input_ptr_increment;
   1059       int16x8_t input[2];
   1060       for (int i = 0; i < 2; i++) {
   1061         input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
   1062       }
   1063       for (int i = 0; i < 2; i++) {
   1064         input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
   1065       }
   1066       // Load the accumulators from acc_buffer
   1067       int32x4_t acc[4];
   1068       for (int i = 0; i < 4; i++) {
   1069         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
   1070       }
   1071       // Multiply-accumulate
   1072       for (int i = 0; i < 2; i++) {
   1073         acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
   1074                                    vget_low_s16(filter[i]));
   1075         acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
   1076                                    vget_high_s16(filter[i]));
   1077       }
   1078       // Store the accumulators back to acc_buffer
   1079       for (int i = 0; i < 4; i++) {
   1080         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
   1081       }
   1082       acc_buffer_ptr += 16;
   1083     }
   1084   }
   1085 };
   1086 
   1087 template <>
   1088 struct QuantizedDepthwiseConvKernel<true, 1, 16> {
   1089   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
   1090                   const uint8* input_ptr, int16 input_offset,
   1091                   int input_ptr_increment, const uint8* filter_ptr,
   1092                   int16 filter_offset, int32* acc_buffer_ptr) {
   1093     // Load the filters, add filter_offset.
   1094     uint8x8_t filter_u8[2];
   1095     for (int i = 0; i < 2; i++) {
   1096       filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
   1097     }
   1098     int16x8_t filter[2];
   1099     for (int i = 0; i < 2; i++) {
   1100       filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
   1101     }
   1102     for (int i = 0; i < 2; i++) {
   1103       filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
   1104     }
   1105     // Handle one output pixel at a time.
   1106     for (int outp = 0; outp < num_output_pixels; outp++) {
   1107       uint8 input_u8 = *input_ptr;
   1108       input_ptr += input_ptr_increment;
   1109       int16 input = static_cast<int16>(input_u8 + input_offset);
   1110       // Load the accumulators from acc_buffer
   1111       int32x4_t acc[4];
   1112       for (int i = 0; i < 4; i++) {
   1113         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
   1114       }
   1115       // Multiply-accumulate
   1116       for (int i = 0; i < 2; i++) {
   1117         acc[2 * i + 0] =
   1118             vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
   1119         acc[2 * i + 1] =
   1120             vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
   1121       }
   1122       // Store the accumulators back to acc_buffer
   1123       for (int i = 0; i < 4; i++) {
   1124         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
   1125       }
   1126       acc_buffer_ptr += 16;
   1127     }
   1128   }
   1129 };
   1130 
   1131 template <>
   1132 struct QuantizedDepthwiseConvKernel<true, 1, 8> {
   1133   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
   1134                   const uint8* input_ptr, int16 input_offset,
   1135                   int input_ptr_increment, const uint8* filter_ptr,
   1136                   int16 filter_offset, int32* acc_buffer_ptr) {
   1137     // Load the filters, add filter_offset.
   1138     const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
   1139     const int16x8_t filter = vaddq_s16(
   1140         vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
   1141     // Handle one output pixel at a time.
   1142     for (int outp = 0; outp < num_output_pixels; outp++) {
   1143       uint8 input_u8 = *input_ptr;
   1144       input_ptr += input_ptr_increment;
   1145       int16 input = static_cast<int16>(input_u8 + input_offset);
   1146       // Load the accumulators from acc_buffer
   1147       int32x4_t acc[2];
   1148       for (int i = 0; i < 2; i++) {
   1149         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
   1150       }
   1151       // Multiply-accumulate
   1152       acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
   1153       acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
   1154       // Store the accumulators back to acc_buffer
   1155       for (int i = 0; i < 2; i++) {
   1156         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
   1157       }
   1158       acc_buffer_ptr += 8;
   1159     }
   1160   }
   1161 };
   1162 #endif  // USE_NEON
   1163 
   1164 // Accumulates the effect of one row of the filter on a segment of one row
   1165 // of the output, reading the corresponding row of the input.
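        // kAllowStrided selects a kernel that can advance the input pointer by
        // an arbitrary stride. kFixedInputDepth and kFixedDepthMultiplier, when
        // nonzero, are compile-time copies of input_depth and depth_multiplier
        // (checked by the DCHECKs below) that allow the matching kernel
        // specialization to be fully unrolled; zero means the value is only
        // known at runtime.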
   1166 template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
   1167 void QuantizedDepthwiseConvAccumRow(
   1168     int stride, int input_depth, int input_width, const uint8* input_data,
   1169     int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
   1170     const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
   1171     int out_x_buffer_end, int output_depth, int32* acc_buffer) {
   1172 #ifdef GEMMLOWP_PROFILING
   1173   gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
   1174 #endif
   1175   // Sanity check parameters. This is important in particular to ensure
   1176   // that we keep the number of template instantiations minimal, so we don't
   1177   // increase binary size unnecessarily.
   1178   static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
   1179   static_assert(kFixedInputDepth || kAllowStrided, "");
   1180   DCHECK(stride == 1 || kAllowStrided);
   1181   if (kFixedInputDepth) {
   1182     DCHECK_EQ(input_depth, kFixedInputDepth);
   1183   }
   1184   if (kFixedDepthMultiplier) {
   1185     DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
   1186   }
   1187   DCHECK_EQ(output_depth, input_depth * depth_multiplier);
   1188   const int input_ptr_increment = stride * input_depth;
   1189   const uint8* filter_base_ptr = filter_data;
   1190   for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
   1191     // For the current (filter_x, filter_y) point in the filter,
   1192     // compute the boundaries of the corresponding output row segment.
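            // An output position out_x reads the input at
            //   in_x = out_x * stride - pad_width + filter_x,
            // so staying within [0, input_width) requires
            //   ceil((pad_width - filter_x) / stride) <= out_x
            //     < ceil((pad_width + input_width - filter_x) / stride).
            // The branches below compute this rounded-up division, with the
            // common cases stride == 2 and stride == 4 spelled out explicitly.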
   1193     int out_x_loop_start_unclamped = 0;
   1194     int out_x_loop_end_unclamped = 0;
   1195     if (kAllowStrided) {
   1196       if (stride == 2) {
   1197         out_x_loop_start_unclamped = (pad_width - filter_x + 1) / 2;
   1198         out_x_loop_end_unclamped =
   1199             (pad_width + input_width - filter_x + 1) / 2;
   1200       } else if (stride == 4) {
   1201         out_x_loop_start_unclamped = (pad_width - filter_x + 3) / 4;
   1202         out_x_loop_end_unclamped =
   1203             (pad_width + input_width - filter_x + 3) / 4;
   1204       } else {
   1205         out_x_loop_start_unclamped =
   1206             (pad_width - filter_x + stride - 1) / stride;
   1207         out_x_loop_end_unclamped =
   1208             (pad_width + input_width - filter_x + stride - 1) / stride;
   1209       }
   1210     } else {
   1211       out_x_loop_start_unclamped = pad_width - filter_x;
   1212       out_x_loop_end_unclamped = pad_width + input_width - filter_x;
   1213     }
   1214     // The kernel will have to iterate on the segment of the
   1215     // output row that starts at out_x_loop_start and ends at out_x_loop_end.
   1216     const int out_x_loop_start =
   1217         std::max(out_x_buffer_start, out_x_loop_start_unclamped);
   1218     const int out_x_loop_end =
   1219         std::min(out_x_buffer_end, out_x_loop_end_unclamped);
   1220 
   1221     int32* acc_buffer_ptr =
   1222         acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
   1223     const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
   1224     const uint8* input_ptr = input_data + in_x_origin * input_depth;
   1225     const int num_output_pixels = out_x_loop_end - out_x_loop_start;
   1226     QuantizedDepthwiseConvKernel<
   1227         kAllowStrided, kFixedInputDepth,
   1228         kFixedDepthMultiplier>::Run(num_output_pixels, input_depth,
   1229                                     depth_multiplier, input_ptr, input_offset,
   1230                                     input_ptr_increment, filter_base_ptr,
   1231                                     filter_offset, acc_buffer_ptr);
   1232     filter_base_ptr += output_depth;
   1233   }
   1234 }
   1235 
   1236 // Generic, portable, non-templatized fallback of QuantizedDepthwiseConvAccumRow.
   1237 inline void QuantizedDepthwiseConvAccumRowGeneric(
   1238     int stride, int input_depth, int input_width, const uint8* input_data,
   1239     int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
   1240     const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
   1241     int out_x_buffer_end, int output_depth, int32* acc_buffer) {
   1242   gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
   1243   const uint8* filter_base_ptr = filter_data;
   1244   for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
   1245     const int out_x_loop_start = std::max(
   1246         out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
   1247     const int out_x_loop_end =
   1248         std::min(out_x_buffer_end,
   1249                  (pad_width + input_width - filter_x + stride - 1) / stride);
   1250 
   1251     int32* acc_buffer_ptr =
   1252         acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
   1253     const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
   1254     const uint8* input_ptr = input_data + in_x_origin * input_depth;
   1255     const int input_ptr_increment = (stride - 1) * input_depth;
   1256     for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
   1257       const uint8* filter_ptr = filter_base_ptr;
   1258       for (int ic = 0; ic < input_depth; ++ic) {
   1259         const int16 input_val = *input_ptr++ + input_offset;
   1260         for (int m = 0; m < depth_multiplier; m++) {
   1261           const int16 filter_val = *filter_ptr++ + filter_offset;
   1262           *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
   1263         }
   1264       }
   1265       input_ptr += input_ptr_increment;
   1266     }
   1267     filter_base_ptr += output_depth;
   1268   }
   1269 }
   1270 
   1271 // Initializes the accumulator buffer with bias values.
   1272 inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
   1273                                        const int32* bias_data,
   1274                                        int32* acc_buffer) {
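          // The NEON paths below splat the bias values for a few common output
          // depths, writing several output pixels per iteration; the trailing
          // memcpy loop handles any remaining pixels as well as all other
          // output depths.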
   1275   int i = 0;
   1276 #ifdef USE_NEON
   1277   if (output_depth == 1) {
   1278     const int32x4_t b = vdupq_n_s32(bias_data[0]);
   1279     for (; i <= num_output_pixels - 16; i += 16) {
   1280       vst1q_s32(acc_buffer + i + 0, b);
   1281       vst1q_s32(acc_buffer + i + 4, b);
   1282       vst1q_s32(acc_buffer + i + 8, b);
   1283       vst1q_s32(acc_buffer + i + 12, b);
   1284     }
   1285     for (; i <= num_output_pixels - 4; i += 4) {
   1286       vst1q_s32(acc_buffer + i, b);
   1287     }
   1288   } else if (output_depth == 2) {
   1289     int32x4_t b = vdupq_n_s32(bias_data[0]);
   1290     b = vsetq_lane_s32(bias_data[1], b, 1);
   1291     b = vsetq_lane_s32(bias_data[1], b, 3);
   1292     for (; i <= num_output_pixels - 8; i += 8) {
   1293       vst1q_s32(acc_buffer + 2 * i + 0, b);
   1294       vst1q_s32(acc_buffer + 2 * i + 4, b);
   1295       vst1q_s32(acc_buffer + 2 * i + 8, b);
   1296       vst1q_s32(acc_buffer + 2 * i + 12, b);
   1297     }
   1298     for (; i <= num_output_pixels - 2; i += 2) {
   1299       vst1q_s32(acc_buffer + 2 * i, b);
   1300     }
   1301   } else if (output_depth == 4) {
   1302     const int32x4_t b = vld1q_s32(bias_data);
   1303     for (; i <= num_output_pixels - 4; i += 4) {
   1304       vst1q_s32(acc_buffer + 4 * i + 0, b);
   1305       vst1q_s32(acc_buffer + 4 * i + 4, b);
   1306       vst1q_s32(acc_buffer + 4 * i + 8, b);
   1307       vst1q_s32(acc_buffer + 4 * i + 12, b);
   1308     }
   1309     for (; i < num_output_pixels; i++) {
   1310       vst1q_s32(acc_buffer + 4 * i, b);
   1311     }
   1312   } else if (output_depth == 8) {
   1313     const int32x4_t b0 = vld1q_s32(bias_data);
   1314     const int32x4_t b1 = vld1q_s32(bias_data + 4);
   1315     for (; i <= num_output_pixels - 2; i += 2) {
   1316       vst1q_s32(acc_buffer + 8 * i + 0, b0);
   1317       vst1q_s32(acc_buffer + 8 * i + 4, b1);
   1318       vst1q_s32(acc_buffer + 8 * i + 8, b0);
   1319       vst1q_s32(acc_buffer + 8 * i + 12, b1);
   1320     }
   1321     for (; i < num_output_pixels; i++) {
   1322       vst1q_s32(acc_buffer + 8 * i + 0, b0);
   1323       vst1q_s32(acc_buffer + 8 * i + 4, b1);
   1324     }
   1325   } else if (output_depth == 16) {
   1326     const int32x4_t b0 = vld1q_s32(bias_data);
   1327     const int32x4_t b1 = vld1q_s32(bias_data + 4);
   1328     const int32x4_t b2 = vld1q_s32(bias_data + 8);
   1329     const int32x4_t b3 = vld1q_s32(bias_data + 12);
   1330     for (; i < num_output_pixels; i++) {
   1331       vst1q_s32(acc_buffer + 16 * i + 0, b0);
   1332       vst1q_s32(acc_buffer + 16 * i + 4, b1);
   1333       vst1q_s32(acc_buffer + 16 * i + 8, b2);
   1334       vst1q_s32(acc_buffer + 16 * i + 12, b3);
   1335     }
   1336   }
   1337 #endif  // USE_NEON
   1338   for (; i < num_output_pixels; i++) {
   1339     memcpy(acc_buffer + i * output_depth, bias_data,
   1340            sizeof(acc_buffer[0]) * output_depth);
   1341   }
   1342 }
   1343 
   1344 template <FusedActivationFunctionType Ac>
   1345 void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
   1346                    int32 input_offset, const uint8* filter_data,
   1347                    const Dims<4>& filter_dims, int32 filter_offset,
   1348                    const int32* bias_data, const Dims<4>& bias_dims,
   1349                    int stride_width, int stride_height,
   1350                    int pad_width, int pad_height, int depth_multiplier,
   1351                    int32 output_offset, int32 output_multiplier,
   1352                    int output_shift, int32 output_activation_min,
   1353                    int32 output_activation_max, uint8* output_data,
   1354                    const Dims<4>& output_dims) {
   1355   gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
   1356   static_assert(Ac == FusedActivationFunctionType::kNone ||
   1357                     Ac == FusedActivationFunctionType::kRelu ||
   1358                     Ac == FusedActivationFunctionType::kRelu6 ||
   1359                     Ac == FusedActivationFunctionType::kRelu1,
   1360                 "");
   1361   DCHECK_LE(output_activation_min, output_activation_max);
   1362   if (Ac == FusedActivationFunctionType::kNone) {
   1363     DCHECK_EQ(output_activation_min, 0);
   1364     DCHECK_EQ(output_activation_max, 255);
   1365   }
   1366   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   1367   const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
   1368   const int input_height = ArraySize(input_dims, 2);
   1369   const int input_width = ArraySize(input_dims, 1);
   1370   const int input_depth = ArraySize(input_dims, 0);
   1371   const int filter_height = ArraySize(filter_dims, 2);
   1372   const int filter_width = ArraySize(filter_dims, 1);
   1373   const int output_height = ArraySize(output_dims, 2);
   1374   const int output_width = ArraySize(output_dims, 1);
   1375   DCHECK(output_depth == input_depth * depth_multiplier);
   1376 
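          // acc_buffer is a fixed-size stack array of int32 accumulators. It
          // holds kOutputPixelsInAccBuffer output pixels at a time, each with
          // output_depth channels, and the output row is processed in chunks
          // of that many pixels.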
   1377   static const int kAccBufferMaxSize = 1024;
   1378   int32 acc_buffer[kAccBufferMaxSize];
   1379   DCHECK_GE(kAccBufferMaxSize, output_depth);
   1380   const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
   1381   const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
   1382   DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize);
   1383   DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
   1384   DCHECK_GE(kOutputPixelsInAccBuffer, 1);
   1385 
   1386   // row_accum_func will point to the core accumulation function to be used
   1387   // for this DepthwiseConv op.
   1388   auto* row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
   1389 
   1390   const int kMaxFixedDepthMultiplier = 16;
   1391   int fixed_depth_multiplier = 0;
   1392   if (depth_multiplier <= kMaxFixedDepthMultiplier) {
   1393     fixed_depth_multiplier = depth_multiplier;
   1394   }
   1395   // kMaxUnrolling is the max number of output values that we aim to handle
   1396   // in one unrolled iteration of the inner loop. For practical performance
   1397   // reasons, it is limited by the number of available registers. We could
   1398   // fine-tune it depending on the architecture, but that's not worth doing
   1399   // since this whole code is not very optimized to begin with. The present
   1400   // value reflects what's realistic on 32-bit ARM NEON with its 16 128-bit
   1401   // vector registers.
   1402   const int kMaxUnrolling = 16;
   1403   int fixed_input_depth = 0;
   1404   if (fixed_depth_multiplier &&
   1405       input_depth * fixed_depth_multiplier <= kMaxUnrolling) {
   1406     fixed_input_depth = input_depth;
   1407   }
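          // Each use of the macro below installs the corresponding specialized
          // kernel as row_accum_func when the runtime shape matches its fixed
          // parameters; if none matches, the generic row function chosen above
          // is kept.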
   1408 #define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
   1409                                         FIXED_DEPTH_MULTIPLIER)           \
   1410   if ((stride_width == 1 || ALLOW_STRIDED) &&                             \
   1411       fixed_input_depth == FIXED_INPUT_DEPTH &&                           \
   1412       fixed_depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                 \
   1413     row_accum_func =                                                      \
   1414         QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,  \
   1415                                        FIXED_DEPTH_MULTIPLIER>;           \
   1416   }
   1417 
   1418 #ifdef USE_NEON
   1419   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
   1420   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
   1421   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
   1422   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
   1423   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
   1424   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
   1425   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
   1426   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
   1427   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
   1428   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
   1429   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
   1430   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
   1431   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
   1432   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
   1433   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
   1434   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
   1435 #endif  // USE_NEON
   1436 
   1437 #undef TFMINI_USE_DEPTHWISECONV_KERNEL
   1438 
   1439   // Now that we have determined row_accum_func, we can start work.
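          // Loop structure: for each batch and each output row, the row is
          // processed in chunks of kOutputPixelsInAccBuffer output pixels. Each
          // chunk is initialized with the bias values, accumulated over the
          // filter rows that overlap the input, then requantized to uint8 and
          // stored.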
   1440   uint8* output_ptr = output_data;
   1441   for (int b = 0; b < batches; ++b) {
   1442     for (int out_y = 0; out_y < output_height; ++out_y) {
   1443       const int in_y_origin = (out_y * stride_height) - pad_height;
   1444       const int filter_y_start = std::max(0, -in_y_origin);
   1445       const int filter_y_end =
   1446           std::min(filter_height, input_height - in_y_origin);
   1447       for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
   1448            out_x_buffer_start += kOutputPixelsInAccBuffer) {
   1449         const int out_x_buffer_end = std::min(
   1450             output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
   1451         // We call a 'pixel' a group of activations that share all but the
   1452         // 'depth'/'channel' coordinate. num_output_pixels is the number of
   1453         // output pixels that we will accumulate in this loop iteration.
   1454         const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
   1455         // Initialize our local accumulator with the bias values, so we don't
   1456         // have to add them later.
   1457         DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
   1458                                    acc_buffer);
   1459         // Accumulation loop. Most of the time should be spent in here.
   1460         for (int filter_y = filter_y_start; filter_y < filter_y_end;
   1461              ++filter_y) {
   1462           const int in_y = in_y_origin + filter_y;
   1463           row_accum_func(
   1464               stride_width, input_depth, input_width,
   1465               input_data + in_y * input_dims.strides[2] +
   1466                   b * input_dims.strides[3],
   1467               input_offset, pad_width, depth_multiplier, filter_width,
   1468               filter_data + filter_y * filter_dims.strides[2], filter_offset,
   1469               out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
   1470         }
   1471         // Finished accumulating int32 values. Now convert them to the
   1472         // final 8-bit form and store them.
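                // Requantization applied below, first vectorized and then for
                // scalar leftovers: multiply by output_multiplier as a Q31
                // fixed-point value (vqrdmulh /
                // MultiplyByQuantizedMultiplierSmallerThanOne), divide by
                // 2^output_shift with rounding, add output_offset, clamp to
                // [output_activation_min, output_activation_max] (a no-op when
                // no activation is fused, since that range is then [0, 255]),
                // and saturating-cast to uint8.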
   1473         gemmlowp::ScopedProfilingLabel label("downquantize+store");
   1474         const int num_output_values = output_depth * num_output_pixels;
   1475         int i = 0;
   1476 #ifdef USE_NEON
   1477         using gemmlowp::RoundingDivideByPOT;
   1478         const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
   1479         const int32x4_t output_activation_min_vec =
   1480             vdupq_n_s32(output_activation_min);
   1481         const int32x4_t output_activation_max_vec =
   1482             vdupq_n_s32(output_activation_max);
   1483         // Handle 16 values at once.
   1484         // This allows us to issue 4 mutually independent int32
   1485         // multiplications (vqrdmulh), which should alleviate most of their
   1486         // high latency.
   1487         for (; i <= num_output_values - 16; i += 16) {
   1488           int32x4_t acc[4];
   1489           for (int j = 0; j < 4; j++) {
   1490             acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
   1491           }
   1492 
   1493           // Fixed-point multiplication.
   1494           for (int j = 0; j < 4; j++) {
   1495             acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
   1496           }
   1497           for (int j = 0; j < 4; j++) {
   1498             acc[j] = RoundingDivideByPOT(acc[j], output_shift);
   1499           }
   1500           // Add the output offset.
   1501           for (int j = 0; j < 4; j++) {
   1502             acc[j] = vaddq_s32(acc[j], output_offset_vec);
   1503           }
   1504           // Apply the activation function.
   1505           if (Ac != FusedActivationFunctionType::kNone) {
   1506             for (int j = 0; j < 4; j++) {
   1507               acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
   1508             }
   1509             for (int j = 0; j < 4; j++) {
   1510               acc[j] = vminq_s32(acc[j], output_activation_max_vec);
   1511             }
   1512           }
   1513           // Saturating cast to uint8 and store to destination.
   1514           int16x4_t acc_s16[4];
   1515           for (int j = 0; j < 4; j++) {
   1516             acc_s16[j] = vqmovn_s32(acc[j]);
   1517           }
   1518           const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
   1519           const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
   1520           const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
   1521           const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
   1522           vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
   1523           output_ptr += 16;
   1524         }
   1525         // Handle 8 values at once.
   1526         // Not as good as 16 (now we're only issuing 2 mutually independent
   1527         // vqrdmulh instructions, so we're probably paying for their high
   1528         // latency).
   1529         for (; i <= num_output_values - 8; i += 8) {
   1530           int32x4_t acc0 = vld1q_s32(acc_buffer + i);
   1531           int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
   1532           // Fixed-point multiplication.
   1533           acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
   1534           acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
   1535           // Rounding right shift.
   1536           acc0 = RoundingDivideByPOT(acc0, output_shift);
   1537           acc1 = RoundingDivideByPOT(acc1, output_shift);
   1538           // Add the output offset.
   1539           acc0 = vaddq_s32(acc0, output_offset_vec);
   1540           acc1 = vaddq_s32(acc1, output_offset_vec);
   1541           // Apply the activation function.
   1542           if (Ac != FusedActivationFunctionType::kNone) {
   1543             acc0 = vmaxq_s32(acc0, output_activation_min_vec);
   1544             acc1 = vmaxq_s32(acc1, output_activation_min_vec);
   1545             acc0 = vminq_s32(acc0, output_activation_max_vec);
   1546             acc1 = vminq_s32(acc1, output_activation_max_vec);
   1547           }
   1548           // Saturating cast to uint8 and store to destination.
   1549           const int16x4_t acc0_s16 = vqmovn_s32(acc0);
   1550           const int16x4_t acc1_s16 = vqmovn_s32(acc1);
   1551           const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
   1552           const uint8x8_t res_u8 = vqmovun_s16(res_s16);
   1553           vst1_u8(output_ptr, res_u8);
   1554           output_ptr += 8;
   1555         }
   1556         // Handle 4 values at once. Now we're paying the full price of the
   1557         // high latency of vqrdmulh. Also, storing only 4 bytes at the end
   1558         // (without any alignment) can only be done 1 byte at a time.
   1559         // Yet, that is still worth doing to minimize the amount of leftover
   1560         // that will have to go through the very slow scalar code.
   1561         for (; i <= num_output_values - 4; i += 4) {
   1562           int32x4_t acc = vld1q_s32(acc_buffer + i);
   1563           // Fixed-point multiplication.
   1564           acc = vqrdmulhq_n_s32(acc, output_multiplier);
   1565           // Rounding right shift.
   1566           acc = RoundingDivideByPOT(acc, output_shift);
   1567           // Add the output offset.
   1568           acc = vaddq_s32(acc, output_offset_vec);
   1569           // Apply the activation function.
   1570           if (Ac != FusedActivationFunctionType::kNone) {
   1571             acc = vmaxq_s32(acc, output_activation_min_vec);
   1572             acc = vminq_s32(acc, output_activation_max_vec);
   1573           }
   1574           // Saturating cast to uint8 and store to destination.
   1575           const int16x4_t acc_s16 = vqmovn_s32(acc);
   1576           const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
   1577           const uint8x8_t res_u8 = vqmovun_s16(res_s16);
   1578           vst1_lane_u8(output_ptr + 0, res_u8, 0);
   1579           vst1_lane_u8(output_ptr + 1, res_u8, 1);
   1580           vst1_lane_u8(output_ptr + 2, res_u8, 2);
   1581           vst1_lane_u8(output_ptr + 3, res_u8, 3);
   1582           output_ptr += 4;
   1583         }
   1584 #endif  // USE_NEON
   1585 
   1586         // Handle leftover values, one by one. This is very slow.
   1587         for (; i < num_output_values; i++) {
   1588           int32 acc = acc_buffer[i];
   1589           acc = MultiplyByQuantizedMultiplierSmallerThanOne(
   1590               acc, output_multiplier, output_shift);
   1591           acc += output_offset;
   1592           acc = std::max(acc, output_activation_min);
   1593           acc = std::min(acc, output_activation_max);
   1594           *output_ptr++ = static_cast<uint8>(acc);
   1595         }
   1596       }
   1597     }
   1598   }
   1599 }
   1600 
   1601 }  // namespace optimized_ops
   1602 }  // namespace nn
   1603 }  // namespace android
   1604 
   1605 #endif  // ANDROID_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
   1606