/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_ports/mem.h"

// Note:
// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
// 2. After refactoring the shared code in the kernel loops into inline
// functions, decoder speed dropped noticeably with gcc, so those parts are
// left unrefactored for now.
// 3. For the horizontal convolution there is an alternative optimization that
// convolves a single row per loop iteration. For each row, 8 sample banks of
// 4 or 8 samples each are either read from memory at src, (src + 1),
// (src + 2), (src + 3), (src + 4), (src + 5), (src + 6), (src + 7), or
// prepared with vector extract instructions (see the sketch below). That
// version is much faster in the speed unit test, but it slowed the whole
// decoder down by 5%.

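// The helper below is an illustrative, unused sketch of the alternative
// single-row approach described in note 3: load 16 contiguous bytes once and
// derive the banks for src + 1 ... src + 7 with vector extracts instead of
// eight separate loads. The name extract_sample_banks is hypothetical and the
// function is not called by any of the kernels in this file.
static INLINE void extract_sample_banks(const uint8_t *src,
                                        uint8x8_t bank[8]) {
  const uint8x16_t s = vld1q_u8(src);    // src[0..15]
  const uint8x8_t lo = vget_low_u8(s);   // src[0..7]
  const uint8x8_t hi = vget_high_u8(s);  // src[8..15]
  bank[0] = lo;
  bank[1] = vext_u8(lo, hi, 1);  // src[1..8]
  bank[2] = vext_u8(lo, hi, 2);
  bank[3] = vext_u8(lo, hi, 3);
  bank[4] = vext_u8(lo, hi, 4);
  bank[5] = vext_u8(lo, hi, 5);
  bank[6] = vext_u8(lo, hi, 6);
  bank[7] = vext_u8(lo, hi, 7);  // src[7..14]
}

// Load four rows of 8 bytes each from s, stepping by p bytes between rows.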
static INLINE void load_8x4(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0,
                            uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

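// Load eight rows of 8 bytes each from s, stepping by p bytes between rows.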
static INLINE void load_8x8(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0,
                            uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3,
                            uint8x8_t *s4, uint8x8_t *s5, uint8x8_t *s6,
                            uint8x8_t *s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

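// Store eight rows of 8 bytes each to s, stepping by p bytes between rows.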
static INLINE void store_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                             const uint8x8_t s1, const uint8x8_t s2,
                             const uint8x8_t s3, const uint8x8_t s4,
                             const uint8x8_t s5, const uint8x8_t s6,
                             const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

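// Apply the 8-tap filter to 4 pixels. filter3 and filter4 hold taps 3 and 4
// (the largest coefficients); they are applied last with saturating adds so
// the 16-bit accumulator cannot wrap. The caller rounds, shifts right by 7
// and narrows the result with vqrshrun_n_s16().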
static INLINE int16x4_t convolve8_4(int16x4_t s0, int16x4_t s1, int16x4_t s2,
                                    int16x4_t s3, int16x4_t s4, int16x4_t s5,
                                    int16x4_t s6, int16x4_t s7,
                                    int16x8_t filters, int16x4_t filter3,
                                    int16x4_t filter4) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int16x4_t sum = vdup_n_s16(0);

  sum = vmla_lane_s16(sum, s0, filters_lo, 0);
  sum = vmla_lane_s16(sum, s1, filters_lo, 1);
  sum = vmla_lane_s16(sum, s2, filters_lo, 2);
  sum = vmla_lane_s16(sum, s5, filters_hi, 1);
  sum = vmla_lane_s16(sum, s6, filters_hi, 2);
  sum = vmla_lane_s16(sum, s7, filters_hi, 3);
  sum = vqadd_s16(sum, vmul_s16(s3, filter3));
  sum = vqadd_s16(sum, vmul_s16(s4, filter4));
  return sum;
}

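// 8-pixel-wide version of convolve8_4().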
static INLINE int16x8_t convolve8_8(int16x8_t s0, int16x8_t s1, int16x8_t s2,
                                    int16x8_t s3, int16x8_t s4, int16x8_t s5,
                                    int16x8_t s6, int16x8_t s7,
                                    int16x8_t filters, int16x8_t filter3,
                                    int16x8_t filter4) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int16x8_t sum = vdupq_n_s16(0);

  sum = vmlaq_lane_s16(sum, s0, filters_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
  sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
  sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
  sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
  return sum;
}

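// Horizontal 8-tap convolution with a fixed unit step (x_step_q4 == 16).
// Rows are loaded and transposed so the horizontal filter can be applied with
// the vector kernels above, then transposed back before storing.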
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y,  // unused
                              int y_step_q4,            // unused
                              int w, int h) {
  const int16x8_t filters = vld1q_s16(filter_x);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  src -= 3;

  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
        d1, d2, d3;
    int16x8_t tt0, tt1, tt2, tt3;

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    s0 = vget_low_s16(tt0);
    s1 = vget_low_s16(tt1);
    s2 = vget_low_s16(tt2);
    s3 = vget_low_s16(tt3);
    s4 = vget_high_s16(tt0);
    s5 = vget_high_s16(tt1);
    s6 = vget_high_s16(tt2);
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s7 = vget_low_s16(tt0);
      s8 = vget_low_s16(tt1);
      s9 = vget_low_s16(tt2);
      s10 = vget_low_s16(tt3);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      transpose_u8_4x4(&d01, &d23);

      vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
                    vreinterpret_u32_u8(d01), 0);
      vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
                    vreinterpret_u32_u8(d23), 0);
      vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
                    vreinterpret_u32_u8(d01), 1);
      vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
                    vreinterpret_u32_u8(d23), 1);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    if (w == 4) {
      do {
        load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        t0 = vqrshrun_n_s16(d0, 7);
        t1 = vqrshrun_n_s16(d1, 7);
        t2 = vqrshrun_n_s16(d2, 7);
        t3 = vqrshrun_n_s16(d3, 7);
        transpose_u8_8x4(&t0, &t1, &t2, &t3);
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1);
        dst += dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8_t *d;
      int16x8_t s11, s12, s13, s14, d4, d5, d6, d7;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                           filter4);
          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                           filter4);
          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                           filter4);
          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                           filter4);
          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
                           filter4);
          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
                           filter4);
          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
                           filter4);
          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
                           filter3, filter4);

          t0 = vqrshrun_n_s16(d0, 7);
          t1 = vqrshrun_n_s16(d1, 7);
          t2 = vqrshrun_n_s16(d2, 7);
          t3 = vqrshrun_n_s16(d3, 7);
          t4 = vqrshrun_n_s16(d4, 7);
          t5 = vqrshrun_n_s16(d5, 7);
          t6 = vqrshrun_n_s16(d6, 7);
          t7 = vqrshrun_n_s16(d7, 7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          store_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

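// Same as vpx_convolve8_horiz_neon(), except the filtered result is averaged
// with the existing contents of dst using rounding halving adds (vrhaddq_u8).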
void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y,  // unused
                                  int y_step_q4,            // unused
                                  int w, int h) {
  const int16x8_t filters = vld1q_s16(filter_x);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  src -= 3;

  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
        d1, d2, d3;
    int16x8_t tt0, tt1, tt2, tt3;
    uint32x4_t d0123 = vdupq_n_u32(0);

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    s0 = vget_low_s16(tt0);
    s1 = vget_low_s16(tt1);
    s2 = vget_low_s16(tt2);
    s3 = vget_low_s16(tt3);
    s4 = vget_high_s16(tt0);
    s5 = vget_high_s16(tt1);
    s6 = vget_high_s16(tt2);
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s7 = vget_low_s16(tt0);
      s8 = vget_low_s16(tt1);
      s9 = vget_low_s16(tt2);
      s10 = vget_low_s16(tt3);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      transpose_u8_4x4(&d01, &d23);

      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
      d0123 = vreinterpretq_u32_u8(
          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));

      vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
      vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
      vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    if (w == 4) {
      uint32x4_t d0415 = vdupq_n_u32(0);
      uint32x4_t d2637 = vdupq_n_u32(0);
      do {
        load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        t0 = vqrshrun_n_s16(d0, 7);
        t1 = vqrshrun_n_s16(d1, 7);
        t2 = vqrshrun_n_s16(d2, 7);
        t3 = vqrshrun_n_s16(d3, 7);
        transpose_u8_8x4(&t0, &t1, &t2, &t3);

        d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3);
        d0415 = vreinterpretq_u32_u8(
            vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1)));
        d2637 = vreinterpretq_u32_u8(
            vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3)));

        vst1q_lane_u32((uint32_t *)dst, d0415, 0);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 2);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 0);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 2);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 1);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 3);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 1);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 3);
        dst += dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8_t *d;
      int16x8_t s11, s12, s13, s14, d4, d5, d6, d7;
      uint8x16_t d01, d23, d45, d67;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                           filter4);
          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                           filter4);
          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                           filter4);
          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                           filter4);
          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
                           filter4);
          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
                           filter4);
          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
                           filter4);
          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
                           filter3, filter4);

          t0 = vqrshrun_n_s16(d0, 7);
          t1 = vqrshrun_n_s16(d1, 7);
          t2 = vqrshrun_n_s16(d2, 7);
          t3 = vqrshrun_n_s16(d3, 7);
          t4 = vqrshrun_n_s16(d4, 7);
          t5 = vqrshrun_n_s16(d5, 7);
          t6 = vqrshrun_n_s16(d6, 7);
          t7 = vqrshrun_n_s16(d7, 7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

          d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
                            vld1_u8(d + 1 * dst_stride));
          d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
                            vld1_u8(d + 3 * dst_stride));
          d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride),
                            vld1_u8(d + 5 * dst_stride));
          d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride),
                            vld1_u8(d + 7 * dst_stride));
          d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1));
          d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3));
          d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
          d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));

          store_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
                    vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
                    vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

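// Vertical 8-tap convolution with a fixed unit step (y_step_q4 == 16). A
// 7-row history (s0-s6) is kept in registers and 4 output rows are produced
// per loop iteration; blocks wider than 4 are processed in 8-pixel columns.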
void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x,  // unused
                             int x_step_q4,            // unused
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  const int16x8_t filters = vld1q_s16(filter_y);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_x;

  src -= 3 * src_stride;

  if (w == 4) {
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
      dst += dst_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
    } while (h > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int height;
    const uint8_t *s;
    uint8_t *d;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        vst1_u8(d, vqrshrun_n_s16(d0, 7));
        d += dst_stride;
        vst1_u8(d, vqrshrun_n_s16(d1, 7));
        d += dst_stride;
        vst1_u8(d, vqrshrun_n_s16(d2, 7));
        d += dst_stride;
        vst1_u8(d, vqrshrun_n_s16(d3, 7));
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}

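// Same as vpx_convolve8_vert_neon(), except the filtered result is averaged
// with the existing contents of dst using rounding halving adds.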
void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x,  // unused
                                 int x_step_q4,            // unused
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  const int16x8_t filters = vld1q_s16(filter_y);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_x;

  src -= 3 * src_stride;

  if (w == 4) {
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
    uint32x4_t d0123 = vdupq_n_u32(0);

    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);

      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
      d0123 = vreinterpretq_u32_u8(
          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));

      vst1q_lane_u32((uint32_t *)dst, d0123, 0);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 1);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 2);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 3);
      dst += dst_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
    } while (h > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x16_t d01, d23, dd01, dd23;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        d01 = vcombine_u8(vqrshrun_n_s16(d0, 7), vqrshrun_n_s16(d1, 7));
        d23 = vcombine_u8(vqrshrun_n_s16(d2, 7), vqrshrun_n_s16(d3, 7));
        dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
                           vld1_u8(d + 1 * dst_stride));
        dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
                           vld1_u8(d + 3 * dst_stride));
        dd01 = vrhaddq_u8(dd01, d01);
        dd23 = vrhaddq_u8(dd23, d23);

        vst1_u8(d, vget_low_u8(dd01));
        d += dst_stride;
        vst1_u8(d, vget_high_u8(dd01));
        d += dst_stride;
        vst1_u8(d, vget_low_u8(dd23));
        d += dst_stride;
        vst1_u8(d, vget_high_u8(dd23));
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}