Home | History | Annotate | Download | only in optimized
      1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
     16 #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
     17 
     18 #include "public/gemmlowp.h"
     19 #include "tensorflow/contrib/lite/kernels/internal/common.h"
     20 #include "tensorflow/contrib/lite/kernels/internal/types.h"
     21 
     22 namespace tflite {
     23 namespace optimized_ops {
     24 
     25 // Implementation of float DepthwiseConv
     26 
// Primary template for the depthwise-conv worker kernels below. Each explicit
// specialization handles one fixed combination of (kAllowStrided,
// kFixedInputDepth, kFixedDepthMultiplier); kFixedInputDepth == 0 means the
// input depth is a runtime value. The primary template is intentionally empty
// (no Run() method), so only the specialized shapes are usable.
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct FloatDepthwiseConvKernel {};
     29 
     30 #ifdef USE_NEON
     31 
// Non-strided kernel, input depth 8, depth multiplier 1: each output pixel
// accumulates 8 channels (two 4-lane registers). input_ptr_increment is
// unused — this kernel advances input_ptr by whole contiguous pixels itself.
template <>
struct FloatDepthwiseConvKernel<false, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters: 8 coefficients held in two q-registers for the
    // whole call.
    float32x4_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs: 2 pixels x 8 channels = 16 floats.
      float32x4_t input[4];
      for (int i = 0; i < 4; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 16;
      // Load the accumulators from acc_buffer
      float32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate. The filter repeats every 8 channels, so
      // filter[0]/filter[1] are reused for the second pixel.
      acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
      acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
      acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
      acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time (remainder of the pairwise loop).
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 8;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};
     92 
// Non-strided kernel, input depth 2, depth multiplier 1: 2 accumulators per
// output pixel. The 2-tap filter is duplicated across both halves of a
// q-register (filters_dup2) so that one vmlaq covers two pixels at once;
// pixel counts are peeled 8 / 4 / 2 / 1 per iteration.
template <>
struct FloatDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    const float32x2_t filters = vld1_f32(filter_ptr);
    const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the inputs: 8 pixels x 2 channels = 16 floats.
      float32x4_t input[4];
      for (int i = 0; i < 4; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 16;
      // Load the accumulators from acc_buffer
      float32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 4; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 8;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      const float32x4_t input = vld1q_f32(input_ptr);
      input_ptr += 4;
      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filters_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time (final remainder, d-registers only).
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float32x2_t input = vld1_f32(input_ptr);
      input_ptr += 2;
      // Load the accumulators from acc_buffer
      float32x2_t acc = vld1_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmla_f32(acc, input, filters);
      // Store the accumulators back to acc_buffer
      vst1_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};
    175 
// Strided kernel, runtime input depth, depth multiplier 1. One output pixel
// per outer iteration; input_ptr advances by input_ptr_increment between
// pixels. Channels are peeled 16 / 4 / 1 per inner iteration.
template <>
struct FloatDepthwiseConvKernel<true, 0, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Filters are re-walked from the start for every pixel (depth
      // multiplier 1: one coefficient per input channel).
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16) {
        // Load the filters
        float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
        float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
        float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
        float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
        local_filter_ptr += 16;
        // Load the inputs
        float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
        float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
        float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
        float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
        local_input_ptr += 16;
        // Load the accumulators from acc_buffer
        float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
        float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
        float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
        float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
        // Multiply-accumulate
        acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
        acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
        acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
        acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
        // Store the accumulators back to acc_buffer
        vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
        vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
        vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
        vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
        acc_buffer_ptr += 16;
      }
      // Handle 4 input channels at a time.
      for (; ic <= input_depth - 4; ic += 4) {
        // Load the filters
        float32x4_t filter;
        filter = vld1q_f32(local_filter_ptr);
        local_filter_ptr += 4;
        // Load the inputs
        float32x4_t input;
        input = vld1q_f32(local_input_ptr);
        local_input_ptr += 4;
        // Load the accumulators from acc_buffer
        float32x4_t acc;
        acc = vld1q_f32(acc_buffer_ptr);
        // Multiply-accumulate
        acc = vmlaq_f32(acc, input, filter);
        // Store the accumulators back to acc_buffer
        vst1q_f32(acc_buffer_ptr, acc);
        acc_buffer_ptr += 4;
      }
      // Handle one input channel at a time (scalar remainder).
      for (; ic < input_depth; ic++) {
        const float input_val = *local_input_ptr++;
        const float filter_val = *local_filter_ptr++;
        *acc_buffer_ptr++ += filter_val * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
    246 
// Strided kernel, runtime input depth, depth multiplier 8: each input
// channel yields 8 accumulator outputs (two q-registers). Channels are
// processed 2 at a time, then 1 at a time.
template <>
struct FloatDepthwiseConvKernel<true, 0, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 2 input channels at a time (2 channels x 8 outputs = 16
      // accumulators, i.e. four q-registers).
      for (; ic <= input_depth - 2; ic += 2) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs: one scalar per channel, two channels in a
        // d-register so vmlaq_lane can broadcast each lane.
        const float32x2_t input = vld1_f32(local_input_ptr);
        local_input_ptr += 2;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate: lane 0 feeds the first channel's 8 outputs,
        // lane 1 the second channel's 8 outputs.
        acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
        acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
        acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
        acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time (remainder).
      for (; ic < input_depth; ic++) {
        // Load the filters
        float32x4_t filter[2];
        for (int i = 0; i < 2; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 8;
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Load the accumulators from acc_buffer
        float32x4_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        for (int i = 0; i < 2; i++) {
          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
    313 
// Strided kernel, runtime input depth, depth multiplier 2: each input
// channel yields 2 adjacent accumulator outputs. Channels are peeled
// 8 / 4 / 2 / 1 per inner iteration.
// NOTE: this implementation is very slow for input depths < 8 (comparable
// to the reference implementation); see the input_depth == 3
// specializations below.
template <>
struct FloatDepthwiseConvKernel<true, 0, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs. vzipq duplicates each input value into two
        // adjacent lanes so they line up with the per-channel pair of
        // filter coefficients.
        float32x4x2_t input_dup2[2];
        for (int i = 0; i < 2; i++) {
          const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
          input_dup2[i] = vzipq_f32(input, input);
        }
        local_input_ptr += 8;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
        acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
        acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
        acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle 4 input channels at a time.
      for (; ic <= input_depth - 4; ic += 4) {
        // Load the filters: one d-register (2 coefficients) per channel.
        float32x2_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1_f32(local_filter_ptr + 2 * i);
        }
        local_filter_ptr += 8;
        // Load the inputs
        const float32x4_t input = vld1q_f32(local_input_ptr);
        local_input_ptr += 4;
        // Load the accumulators from acc_buffer
        float32x2_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
        }
        // Multiply-accumulate: broadcast each input lane against its
        // channel's coefficient pair.
        acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
        acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
        acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
        acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle 2 input channels at a time.
      for (; ic <= input_depth - 2; ic += 2) {
        // Load the filters
        const float32x4_t filter = vld1q_f32(local_filter_ptr);
        local_filter_ptr += 4;
        // Load the inputs
        const float32x2_t input = vld1_f32(local_input_ptr);
        local_input_ptr += 2;
        // Load the accumulators from acc_buffer
        float32x2_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
        }
        // Multiply-accumulate
        acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
        acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
        }
        acc_buffer_ptr += 4;
      }
      // Handle one input channel at a time (scalar remainder: 2 outputs).
      for (; ic < input_depth; ic++) {
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Multiply-accumulate
        for (int i = 0; i < 2; i++) {
          acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
        }
        local_filter_ptr += 2;
        acc_buffer_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
    422 
// Strided kernel, input depth 3, depth multiplier 2: 6 accumulator outputs
// per pixel, held in three d-registers.
template <>
struct FloatDepthwiseConvKernel<true, 3, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters once: 3 channels x 2 coefficients.
    float32x2_t filter[3];
    for (int i = 0; i < 3; i++) {
      filter[i] = vld1_f32(filter_ptr + 2 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Only 3 input values are needed: channels 0-1 as a pair, channel 2
      // duplicated into both lanes.
      const float32x2_t input01 = vld1_f32(input_ptr);
      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
      // Load the accumulators from acc_buffer
      float32x2_t acc[3];
      for (int i = 0; i < 3; i++) {
        acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
      }
      // Multiply-accumulate: each input channel produces 2 outputs.
      acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
      acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
      acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 3; i++) {
        vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
      }
      acc_buffer_ptr += 6;
      input_ptr += input_ptr_increment;
    }
  }
};
    455 
// Strided kernel, input depth 3, depth multiplier 4: 12 accumulator outputs
// per pixel, held in three q-registers.
template <>
struct FloatDepthwiseConvKernel<true, 3, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters once: 3 channels x 4 coefficients.
    float32x4_t filter[3];
    for (int i = 0; i < 3; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // NOTE: we only want 3 values, so we read it as two ops where
      // the second op just duplicates the lane
      const float32x2_t input01 = vld1_f32(input_ptr);
      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
      // Load the accumulators from acc_buffer
      float32x4_t acc[3];
      for (int i = 0; i < 3; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate all outputs: each input channel produces 4.
      acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
      acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
      acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 3; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 12;
      input_ptr += input_ptr_increment;
    }
  }
};
    490 
    491 template <>
    492 struct FloatDepthwiseConvKernel<true, 1, 8> {
    493   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
    494                   const float* input_ptr, int input_ptr_increment,
    495                   const float* filter_ptr, float* acc_buffer_ptr) {
    496     // Load the filters
    497     float32x4_t filter[2];
    498     for (int i = 0; i < 2; i++) {
    499       filter[i] = vld1q_f32(filter_ptr + 4 * i);
    500     }
    501     // Handle one output pixel at a time.
    502     for (int outp = 0; outp < num_output_pixels; outp++) {
    503       // Load the inputs
    504       const float input_val = *input_ptr;
    505       input_ptr += input_ptr_increment;
    506       // Load the accumulators from acc_buffer
    507       float32x4_t acc[2];
    508       for (int i = 0; i < 2; i++) {
    509         acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
    510       }
    511       // Multiply-accumulate
    512       for (int i = 0; i < 2; i++) {
    513         acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
    514       }
    515       // Store the accumulators back to acc_buffer
    516       for (int i = 0; i < 2; i++) {
    517         vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
    518       }
    519       acc_buffer_ptr += 8;
    520     }
    521   }
    522 };
    523 
// Strided kernel, input depth 1, depth multiplier 32: one scalar input per
// pixel broadcast against 32 filter coefficients (eight q-registers). Kept
// fully unrolled with individually named registers — presumably to help
// register allocation; confirm before rolling into arrays/loops.
template <>
struct FloatDepthwiseConvKernel<true, 1, 32> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters once; they are loop-invariant.
    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
    float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
    float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
    float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs: a single scalar per pixel.
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
      float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
      float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
      float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
      // Multiply-accumulate the broadcast scalar input.
      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
      acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
      acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
      acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
      vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
      vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
      vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
      acc_buffer_ptr += 32;
    }
  }
};
    575 
// Strided kernel, input depth 1, depth multiplier 20: one scalar input per
// pixel broadcast against 20 filter coefficients (five q-registers). Same
// fully-unrolled style as the multiplier-32 kernel above.
template <>
struct FloatDepthwiseConvKernel<true, 1, 20> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters once; they are loop-invariant.
    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs: a single scalar per pixel.
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
      // Multiply-accumulate the broadcast scalar input.
      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
      acc_buffer_ptr += 20;
    }
  }
};
    615 
// Strided kernel, runtime input depth, depth multiplier 16: each input
// channel is a scalar broadcast against 16 filter coefficients (four
// q-registers). One channel per inner iteration — no remainder handling
// needed.
template <>
struct FloatDepthwiseConvKernel<true, 0, 16> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      for (int ic = 0; ic < input_depth; ic++) {
        // Load the filters: this channel's 16 coefficients.
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs: one scalar per channel.
        const float input_val = *local_input_ptr++;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate the broadcast scalar input.
        for (int i = 0; i < 4; i++) {
          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
    653 
    654 template <>
    655 struct FloatDepthwiseConvKernel<true, 8, 1> {
    656   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
    657                   const float* input_ptr, int input_ptr_increment,
    658                   const float* filter_ptr, float* acc_buffer_ptr) {
    659     // Load the filters
    660     float32x4_t filter[2];
    661     for (int i = 0; i < 2; i++) {
    662       filter[i] = vld1q_f32(filter_ptr + 4 * i);
    663     }
    664     // Handle one output pixel at a time.
    665     for (int outp = 0; outp < num_output_pixels; outp++) {
    666       // Load the inputs
    667       float32x4_t input[2];
    668       for (int i = 0; i < 2; i++) {
    669         input[i] = vld1q_f32(input_ptr + 4 * i);
    670       }
    671       // Load the accumulators from acc_buffer
    672       float32x4_t acc[2];
    673       for (int i = 0; i < 2; i++) {
    674         acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
    675       }
    676       // Multiply-accumulate
    677       for (int i = 0; i < 2; i++) {
    678         acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
    679       }
    680       // Store the accumulators back to acc_buffer
    681       for (int i = 0; i < 2; i++) {
    682         vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
    683       }
    684       acc_buffer_ptr += 8;
    685       input_ptr += input_ptr_increment;
    686     }
    687   }
    688 };
    689 
    690 template <>
    691 struct FloatDepthwiseConvKernel<true, 2, 1> {
    692   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
    693                   const float* input_ptr, int input_ptr_increment,
    694                   const float* filter_ptr, float* acc_buffer_ptr) {
    695     float32x2_t filter = vld1_f32(filter_ptr);
    696     float32x4_t filter_x4 = vcombine_f32(filter, filter);
    697     int outp = 0;
    698 
    699     // Handle two output pixels at a time.
    700     for (; outp <= num_output_pixels - 2; outp += 2) {
    701       // Load the inputs
    702       float32x2_t input_1 = vld1_f32(input_ptr);
    703       input_ptr += input_ptr_increment;
    704       float32x2_t input_2 = vld1_f32(input_ptr);
    705       input_ptr += input_ptr_increment;
    706       float32x4_t input = vcombine_f32(input_1, input_2);
    707 
    708       // Load the accumulators from acc_buffer
    709       float32x4_t acc = vld1q_f32(acc_buffer_ptr);
    710 
    711       // Multiply-accumulate
    712       acc = vmlaq_f32(acc, input, filter_x4);
    713 
    714       // Store the accumulators back to acc_buffer
    715       vst1q_f32(acc_buffer_ptr, acc);
    716       acc_buffer_ptr += 4;
    717     }
    718     // Handle one output pixel at a time.
    719     for (; outp < num_output_pixels; outp++) {
    720       // Load the inputs
    721       float32x2_t input = vld1_f32(input_ptr);
    722       input_ptr += input_ptr_increment;
    723 
    724       // Load the accumulators from acc_buffer
    725       float32x2_t acc = vld1_f32(acc_buffer_ptr);
    726 
    727       // Multiply-accumulate
    728       acc = vmla_f32(acc, input, filter);
    729 
    730       // Store the accumulators back to acc_buffer
    731       vst1_f32(acc_buffer_ptr, acc);
    732       acc_buffer_ptr += 2;
    733     }
    734   }
    735 };
    736 
    737 template <>
    738 struct FloatDepthwiseConvKernel<true, 4, 1> {
    739   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
    740                   const float* input_ptr, int input_ptr_increment,
    741                   const float* filter_ptr, float* acc_buffer_ptr) {
    742     float32x4_t filter = vld1q_f32(filter_ptr);
    743 
    744     // Handle one output pixel at a time.
    745     for (int outp = 0; outp < num_output_pixels; outp++) {
    746       // Load the inputs
    747       float32x4_t input = vld1q_f32(input_ptr);
    748       // Load the accumulators from acc_buffer
    749       float32x4_t acc = vld1q_f32(acc_buffer_ptr);
    750       // Multiply-accumulate
    751       acc = vmlaq_f32(acc, input, filter);
    752       // Store the accumulators back to acc_buffer
    753       vst1q_f32(acc_buffer_ptr, acc);
    754       acc_buffer_ptr += 4;
    755       input_ptr += input_ptr_increment;
    756     }
    757   }
    758 };
    759 #endif
    760 
    761 // Accumulates the effect of one row of the filter, on a segment of one row
    762 // of the output, accessing the corresponding one row of the input.
    763 template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
    764 void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width,
    765                                 const float* input_data, int pad_width,
    766                                 int depth_multiplier, int filter_width,
    767                                 const float* filter_data,
    768                                 int out_x_buffer_start, int out_x_buffer_end,
    769                                 int output_depth, float* acc_buffer) {
    770 #ifdef GEMMLOWP_PROFILING
    771   gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
    772 #endif
    773   // Sanity check parameters. This is important in particular to ensure
    774   // that we keep the number of template instantiations minimal, so we don't
    775   // increase binary size unnecessarily.
    776   static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
    777   static_assert(kFixedInputDepth || kAllowStrided, "");
    778   TFLITE_DCHECK(stride == 1 || kAllowStrided);
    779   if (kFixedInputDepth) {
    780     TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
    781   }
    782   if (kFixedDepthMultiplier) {
    783     TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
    784   }
    785   TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
    786   const int input_ptr_increment = stride * input_depth;
    787   const float* filter_base_ptr = filter_data;
    788   for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
    789     // For the current (filter_x, filter_y) point in the filter,
    790     // compute the boundaries of the corresponding output row segment.
    791     int out_x_loop_start_unclampled = 0;
    792     int out_x_loop_end_unclampled = 0;
    793     if (kAllowStrided) {
    794       if (stride == 2) {
    795         out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
    796         out_x_loop_end_unclampled =
    797             (pad_width + input_width - filter_x + 1) / 2;
    798       } else if (stride == 4) {
    799         out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
    800         out_x_loop_end_unclampled =
    801             (pad_width + input_width - filter_x + 3) / 4;
    802       } else {
    803         out_x_loop_start_unclampled =
    804             (pad_width - filter_x + stride - 1) / stride;
    805         out_x_loop_end_unclampled =
    806             (pad_width + input_width - filter_x + stride - 1) / stride;
    807       }
    808     } else {
    809       out_x_loop_start_unclampled = pad_width - filter_x;
    810       out_x_loop_end_unclampled = pad_width + input_width - filter_x;
    811     }
    812     // The kernel will have to iterate on the segment of the
    813     // output row that starts at out_x_loop_start and out_x_loop_end.
    814     const int out_x_loop_start =
    815         std::max(out_x_buffer_start, out_x_loop_start_unclampled);
    816     const int out_x_loop_end =
    817         std::min(out_x_buffer_end, out_x_loop_end_unclampled);
    818 
    819     float* acc_buffer_ptr =
    820         acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    821     const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
    822     const float* input_ptr = input_data + in_x_origin * input_depth;
    823     const int num_output_pixels = out_x_loop_end - out_x_loop_start;
    824     FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
    825                              kFixedDepthMultiplier>::Run(num_output_pixels,
    826                                                          input_depth,
    827                                                          depth_multiplier,
    828                                                          input_ptr,
    829                                                          input_ptr_increment,
    830                                                          filter_base_ptr,
    831                                                          acc_buffer_ptr);
    832     filter_base_ptr += output_depth;
    833   }
    834 }
    835 
    836 // generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
// Accumulates one filter row's contribution into acc_buffer using plain
// scalar arithmetic; used when no specialized kernel matches the op's
// stride / input_depth / depth_multiplier.
inline void FloatDepthwiseConvAccumRowGeneric(
    int stride, int input_depth, int input_width, const float* input_data,
    int pad_width, int depth_multiplier, int filter_width,
    const float* filter_data, int out_x_buffer_start, int out_x_buffer_end,
    int output_depth, float* acc_buffer) {
  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
#ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
  // Builds configured with the PREVENT token fail loudly here, so the
  // performance cliff of reaching this scalar path is made visible instead
  // of silently degrading inference speed.
  LOG(FATAL)
      << "\n\n"
      << "*****************************************************************\n"
      << "* This tfmini inference code was about to use the slow generic\n"
      << "* fallback implementation for a DepthwiseConv op, and we want you\n"
      << "* to be aware of that so that you will know why you get terrible\n"
      << "* performance.\n"
      << "*\n"
      << "* If you would like to carry on with the slow code, compile\n"
      << "* with this preprocessor token defined:\n"
      << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
      << "*\n"
      << "* The right thing to do, if you care about performance, is to add\n"
      << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
      << "* The relevant parameters defining your case are:\n"
      << "* stride = " << stride << "\n"
      << "* input_depth = " << input_depth << "\n"
      << "* depth_multiplier = " << depth_multiplier << "\n"
      << "*\n"
      << "* Please do not hesitate to contact benoitjacob@ with this\n"
      << "* information.\n"
      << "*****************************************************************\n";
#endif  // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
#endif  // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
  const float* filter_base_ptr = filter_data;
  // One iteration per filter column; each accumulates that column's
  // contribution over the clamped output-x segment
  // [out_x_loop_start, out_x_loop_end).
  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
    const int out_x_loop_start = std::max(
        out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
    const int out_x_loop_end =
        std::min(out_x_buffer_end,
                 (pad_width + input_width - filter_x + stride - 1) / stride);

    float* acc_buffer_ptr =
        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
    const float* input_ptr = input_data + in_x_origin * input_depth;
    // The inner loops consume input_depth values per pixel; this extra
    // increment skips the rest of a strided step to the next input pixel.
    const int input_ptr_increment = (stride - 1) * input_depth;
    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
      const float* filter_ptr = filter_base_ptr;
      // Scalar multiply-accumulate over every (input channel, multiplier)
      // pair of this output pixel.
      for (int ic = 0; ic < input_depth; ++ic) {
        const float input_val = *input_ptr++;
        for (int m = 0; m < depth_multiplier; m++) {
          const float filter_val = *filter_ptr++;
          *acc_buffer_ptr++ += filter_val * input_val;
        }
      }
      input_ptr += input_ptr_increment;
    }
    filter_base_ptr += output_depth;
  }
}
    896 
    897 // Initializes the accumulator buffer with bias values.
    898 inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
    899                                        const float* bias_data,
    900                                        float* acc_buffer) {
    901   // TODO(benoitjacob): This might need optimized specializations
    902   // for small output_depth values, if that ever becomes an important
    903   // case (like it was for some quantized DepthwiseConv cases).
    904   for (int i = 0; i < num_output_pixels; i++) {
    905     memcpy(acc_buffer + i * output_depth, bias_data,
    906            sizeof(acc_buffer[0]) * output_depth);
    907   }
    908 }
    909 
// Float DepthwiseConv entry point. Accumulates the convolution row by row
// into a fixed-size stack buffer (one horizontal strip of output pixels at
// a time), then clamps to [output_activation_min, output_activation_max]
// while storing to output_data. Dimension index 0 of each Dims<4> is the
// depth/channel dimension (see the ArraySize calls below).
inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
                          const float* filter_data, const Dims<4>& filter_dims,
                          const float* bias_data, const Dims<4>& bias_dims,
                          int stride_width, int stride_height, int pad_width,
                          int pad_height, int depth_multiplier,
                          float output_activation_min,
                          float output_activation_max, float* output_data,
                          const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int input_depth = ArraySize(input_dims, 0);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);

  // Fixed-size stack buffer of partial sums; it must be able to hold at
  // least one full output pixel (output_depth floats).
  static const int kAccBufferMaxSize = 2048;
  float acc_buffer[kAccBufferMaxSize];
  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
                   kAccBufferActualSize);
  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConv op.
  using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

// Selects a specialized kernel when (and only when) the op's stride,
// input depth and depth multiplier match the kernel's template parameters;
// FIXED_INPUT_DEPTH == 0 means "any input depth". Only the first matching
// kernel is taken, since row_accum_func is set at most once.
#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
                                        FIXED_DEPTH_MULTIPLIER)           \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
    row_accum_func =                                                      \
        FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,      \
                                   FIXED_DEPTH_MULTIPLIER>;               \
  }

#ifdef USE_NEON
  // We go over our list of kernels by decreasing order of preference
  // for the cases where multiple kernels could apply.

  // Start with the fastest kernels: AllowStrided=false, fixed input depth.

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)

  // Next come the strided kernels: AllowStrided=true, fixed input depth.
  // They are a bit less efficient, but allow stride!=1.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)

  // Finally, the kernels allowing a variable input depth,
  // these are the least efficient but most general kernels.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)

#endif  // USE_NEON

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  // No matching fast kernel found, use slow fallback.
  if (!row_accum_func) {
    row_accum_func = FloatDepthwiseConvAccumRowGeneric;
  }

  // Now that we have determined row_accum_func, we can start work.
  float* output_ptr = output_data;
  for (int b = 0; b < batches; ++b) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      // Clip the filter's vertical extent to the rows that actually overlap
      // the input (handles top/bottom padding).
      const int filter_y_start = std::max(0, -in_y_origin);
      const int filter_y_end =
          std::min(filter_height, input_height - in_y_origin);
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer) {
        const int out_x_buffer_end = std::min(
            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activation that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        // Initialize our local accumulator with the bias values, so we don't
        // have to add them later.
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
                                   acc_buffer);
        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end;
             ++filter_y) {
          const int in_y = in_y_origin + filter_y;
          row_accum_func(stride_width, input_depth, input_width,
                         input_data + in_y * input_dims.strides[2] +
                             b * input_dims.strides[3],
                         pad_width, depth_multiplier, filter_width,
                         filter_data + filter_y * filter_dims.strides[2],
                         out_x_buffer_start, out_x_buffer_end, output_depth,
                         acc_buffer);
        }
        // Finished accumulating. Now store to destination.
        const int num_output_values = output_depth * num_output_pixels;
        int i = 0;
// TODO(benoitjacob) optimized code goes here
#ifdef USE_NEON
        // Handle 16 values at a time
        for (; i <= num_output_values - 16; i += 16) {
          float32x4_t acc[4];
          for (int k = 0; k < 4; k++) {
            acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
          }
          // Clamp to the fused activation range before storing.
          for (int k = 0; k < 4; k++) {
            acc[k] = vmaxq_f32(
                vdupq_n_f32(output_activation_min),
                vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
          }
          for (int k = 0; k < 4; k++) {
            vst1q_f32(output_ptr + 4 * k, acc[k]);
          }
          output_ptr += 16;
        }
        // Handle 4 values at a time
        for (; i <= num_output_values - 4; i += 4) {
          float32x4_t acc = vld1q_f32(acc_buffer + i);

          acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
                          vminq_f32(vdupq_n_f32(output_activation_max), acc));

          vst1q_f32(output_ptr, acc);
          output_ptr += 4;
        }
#endif
        // Handle leftover values, one by one. This is very slow.
        for (; i < num_output_values; i++) {
          float acc = acc_buffer[i];
          acc = std::max(output_activation_min,
                         std::min(output_activation_max, acc));

          *output_ptr++ = acc;
        }
      }
    }
  }
}
   1069 
   1070 // legacy, for compatibility with old checked-in code
   1071 template <FusedActivationFunctionType Ac>
   1072 void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
   1073                    const float* filter_data, const Dims<4>& filter_dims,
   1074                    const float* bias_data, const Dims<4>& bias_dims,
   1075                    int stride_width, int stride_height, int pad_width,
   1076                    int pad_height, int depth_multiplier, float* output_data,
   1077                    const Dims<4>& output_dims) {
   1078   float output_activation_min, output_activation_max;
   1079   GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
   1080   DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
   1081                 bias_dims, stride_width, stride_height, pad_width, pad_height,
   1082                 depth_multiplier, output_activation_min, output_activation_max,
   1083                 output_data, output_dims);
   1084 }
   1085 
   1086 // legacy, for compatibility with old checked-in code
   1087 template <FusedActivationFunctionType Ac>
   1088 void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
   1089                    const float* filter_data, const Dims<4>& filter_dims,
   1090                    const float* bias_data, const Dims<4>& bias_dims, int stride,
   1091                    int pad_width, int pad_height, int depth_multiplier,
   1092                    float* output_data, const Dims<4>& output_dims) {
   1093   DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
   1094                     bias_dims, stride, stride, pad_width, pad_height,
   1095                     depth_multiplier, output_data, output_dims);
   1096 }
   1097 
   1098 }  // namespace optimized_ops
   1099 }  // namespace tflite
   1100 
   1101 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
   1102