      1 /*
      2  *
      3  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
      4  *
      5  * This source code is subject to the terms of the BSD 2 Clause License and
      6  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      7  * was not distributed with this source code in the LICENSE file, you can
      8  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      9  * Media Patent License 1.0 was not distributed with this source code in the
     10  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     11  */
     12 
     13 #include <assert.h>
     14 #include <arm_neon.h>
     15 
     16 #include "config/av1_rtcd.h"
     17 
     18 #include "aom_dsp/aom_dsp_common.h"
     19 #include "aom_ports/mem.h"
     20 #include "av1/common/convolve.h"
     21 #include "av1/common/filter.h"
     22 #include "av1/common/arm/convolve_neon.h"
     23 #include "av1/common/arm/mem_neon.h"
     24 #include "av1/common/arm/transpose_neon.h"
     25 
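         // Apply an 8-tap filter to four 16-bit samples. No rounding or shifting is
         // done here; the caller applies the appropriate shifts. filter[3] and
         // filter[4] are accumulated with saturating adds (see the comment below).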
     26 static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
     27                                       const int16x4_t s2, const int16x4_t s3,
     28                                       const int16x4_t s4, const int16x4_t s5,
     29                                       const int16x4_t s6, const int16x4_t s7,
     30                                       const int16_t *filter) {
     31   int16x4_t sum;
     32 
     33   sum = vmul_n_s16(s0, filter[0]);
     34   sum = vmla_n_s16(sum, s1, filter[1]);
     35   sum = vmla_n_s16(sum, s2, filter[2]);
     36   sum = vmla_n_s16(sum, s5, filter[5]);
     37   sum = vmla_n_s16(sum, s6, filter[6]);
     38   sum = vmla_n_s16(sum, s7, filter[7]);
      39   /* filter[3] can be as large as 128, so 128 * 255 = 32640 plus the other
      40    * taps can exceed int16_t; hence the saturating adds for these two taps.
      41    */
     42   sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
     43   sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
     44 
     45   return sum;
     46 }
     47 
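         // Horizontal 8-tap filter for eight samples: convolve, apply the two
         // rounding shifts (shift_round_0, then shift_by_bits) and narrow to 8 bits
         // with unsigned saturation.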
     48 static INLINE uint8x8_t convolve8_horiz_8x8(
     49     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     50     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
     51     const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
     52     const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
     53   int16x8_t sum;
     54 
     55   sum = vmulq_n_s16(s0, filter[0]);
     56   sum = vmlaq_n_s16(sum, s1, filter[1]);
     57   sum = vmlaq_n_s16(sum, s2, filter[2]);
     58   sum = vmlaq_n_s16(sum, s5, filter[5]);
     59   sum = vmlaq_n_s16(sum, s6, filter[6]);
     60   sum = vmlaq_n_s16(sum, s7, filter[7]);
      61   /* filter[3] can be as large as 128, so 128 * 255 = 32640 plus the other
      62    * taps can exceed int16_t; hence the saturating adds for these two taps.
      63    */
     64   sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
     65   sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
     66 
     67   sum = vqrshlq_s16(sum, shift_round_0);
     68   sum = vqrshlq_s16(sum, shift_by_bits);
     69 
     70   return vqmovun_s16(sum);
     71 }
     72 
     73 #if !defined(__aarch64__)
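         // Only built when __aarch64__ is not defined: filters a single 4-wide row
         // for the AArch32 fallback path in av1_convolve_x_sr_neon below.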
     74 static INLINE uint8x8_t convolve8_horiz_4x1(
     75     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     76     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
     77     const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
     78     const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
     79   int16x4_t sum;
     80 
     81   sum = vmul_n_s16(s0, filter[0]);
     82   sum = vmla_n_s16(sum, s1, filter[1]);
     83   sum = vmla_n_s16(sum, s2, filter[2]);
     84   sum = vmla_n_s16(sum, s5, filter[5]);
     85   sum = vmla_n_s16(sum, s6, filter[6]);
     86   sum = vmla_n_s16(sum, s7, filter[7]);
      87   /* filter[3] can be as large as 128, so 128 * 255 = 32640 plus the other
      88    * taps can exceed int16_t; hence the saturating adds for these two taps.
      89    */
     90   sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
     91   sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
     92 
     93   sum = vqrshl_s16(sum, shift_round_0);
     94   sum = vqrshl_s16(sum, shift_by_bits);
     95 
     96   return vqmovun_s16(vcombine_s16(sum, sum));
     97 }
      98 #endif  // !defined(__aarch64__)
     99 
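         // Vertical 8-tap filter for eight columns: convolve, round by FILTER_BITS
         // and narrow to 8 bits with unsigned saturation.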
    100 static INLINE uint8x8_t convolve8_vert_8x4(
    101     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    102     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    103     const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
    104   int16x8_t sum;
    105 
    106   sum = vmulq_n_s16(s0, filter[0]);
    107   sum = vmlaq_n_s16(sum, s1, filter[1]);
    108   sum = vmlaq_n_s16(sum, s2, filter[2]);
    109   sum = vmlaq_n_s16(sum, s5, filter[5]);
    110   sum = vmlaq_n_s16(sum, s6, filter[6]);
    111   sum = vmlaq_n_s16(sum, s7, filter[7]);
     112   /* filter[3] can be as large as 128, so 128 * 255 = 32640 plus the other
     113    * taps can exceed int16_t; hence the saturating adds for these two taps.
     114    */
    115   sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
    116   sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
    117 
    118   return vqrshrun_n_s16(sum, FILTER_BITS);
    119 }
    120 
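         // Vertical pass helper for the 2D path (four columns): accumulate in 32
         // bits, add offset_const, apply the round_shift_vec shift, subtract
         // sub_const_vec and clamp negatives to zero. The caller performs the final
         // rounding and narrowing to 8 bits.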
    121 static INLINE uint16x4_t convolve8_vert_4x4_s32(
    122     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    123     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    124     const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
    125     const int32x4_t round_shift_vec, const int32x4_t offset_const,
    126     const int32x4_t sub_const_vec) {
    127   int32x4_t sum0;
    128   uint16x4_t res;
    129   const int32x4_t zero = vdupq_n_s32(0);
    130 
    131   sum0 = vmull_n_s16(s0, y_filter[0]);
    132   sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
    133   sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
    134   sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
    135   sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
    136   sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
    137   sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
    138   sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
    139 
    140   sum0 = vaddq_s32(sum0, offset_const);
    141   sum0 = vqrshlq_s32(sum0, round_shift_vec);
    142   sum0 = vsubq_s32(sum0, sub_const_vec);
    143   sum0 = vmaxq_s32(sum0, zero);
    144 
    145   res = vmovn_u32(vreinterpretq_u32_s32(sum0));
    146 
    147   return res;
    148 }
    149 
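         // Eight-column variant of the above; this version also applies the final
         // vec_round_bits shift and narrows to 8 bits.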
    150 static INLINE uint8x8_t convolve8_vert_8x4_s32(
    151     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    152     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    153     const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
    154     const int32x4_t round_shift_vec, const int32x4_t offset_const,
    155     const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
    156   int32x4_t sum0, sum1;
    157   uint16x8_t res;
    158   const int32x4_t zero = vdupq_n_s32(0);
    159 
    160   sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
    161   sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
    162   sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
    163   sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
    164   sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
    165   sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
    166   sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
    167   sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);
    168 
    169   sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
    170   sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
    171   sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
    172   sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
    173   sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
    174   sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
    175   sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
    176   sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);
    177 
    178   sum0 = vaddq_s32(sum0, offset_const);
    179   sum1 = vaddq_s32(sum1, offset_const);
    180   sum0 = vqrshlq_s32(sum0, round_shift_vec);
    181   sum1 = vqrshlq_s32(sum1, round_shift_vec);
    182   sum0 = vsubq_s32(sum0, sub_const_vec);
    183   sum1 = vsubq_s32(sum1, sub_const_vec);
    184   sum0 = vmaxq_s32(sum0, zero);
    185   sum1 = vmaxq_s32(sum1, zero);
    186   res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
    187                      vqmovn_u32(vreinterpretq_u32_s32(sum1)));
    188 
    189   res = vqrshlq_u16(res, vec_round_bits);
    190 
    191   return vqmovn_u16(res);
    192 }
    193 
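         // Horizontal-only single-reference convolution. The filter output is
         // rounded in two steps: by round_0, then by the remaining
         // FILTER_BITS - round_0 bits.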
    194 void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
    195                             int dst_stride, int w, int h,
    196                             const InterpFilterParams *filter_params_x,
    197                             const InterpFilterParams *filter_params_y,
    198                             const int subpel_x_q4, const int subpel_y_q4,
    199                             ConvolveParams *conv_params) {
    200   const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
    201   const int8_t bits = FILTER_BITS - conv_params->round_0;
    202 
    203   (void)subpel_y_q4;
    204   (void)conv_params;
    205   (void)filter_params_y;
    206 
    207   uint8x8_t t0;
    208 #if defined(__aarch64__)
    209   uint8x8_t t1, t2, t3;
    210 #endif
    211 
    212   assert(bits >= 0);
    213   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
    214          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
    215 
    216   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    217       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    218 
    219   const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
    220   const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
    221 
    222   src -= horiz_offset;
    223 #if defined(__aarch64__)
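           // Special case for 4-row blocks: transpose 8x4 tiles so the horizontal
           // filter can be computed with 4-lane arithmetic, then transpose the 4x4
           // results back before storing.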
    224   if (h == 4) {
    225     uint8x8_t d01, d23;
    226     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
    227     int16x8_t d01_temp, d23_temp;
    228 
    229     __builtin_prefetch(src + 0 * src_stride);
    230     __builtin_prefetch(src + 1 * src_stride);
    231     __builtin_prefetch(src + 2 * src_stride);
    232     __builtin_prefetch(src + 3 * src_stride);
    233 
    234     load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    235     transpose_u8_8x4(&t0, &t1, &t2, &t3);
    236 
    237     s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    238     s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    239     s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    240     s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    241     s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    242     s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    243     s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    244     __builtin_prefetch(dst + 0 * dst_stride);
    245     __builtin_prefetch(dst + 1 * dst_stride);
    246     __builtin_prefetch(dst + 2 * dst_stride);
    247     __builtin_prefetch(dst + 3 * dst_stride);
    248     src += 7;
    249 
    250     do {
    251       load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    252       transpose_u8_8x4(&t0, &t1, &t2, &t3);
    253 
    254       s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    255       s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    256       s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    257       s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    258 
    259       d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
    260 
    261       d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
    262 
    263       d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
    264 
    265       d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
    266 
    267       d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0);
    268       d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0);
    269 
    270       d01_temp = vqrshlq_s16(d01_temp, shift_by_bits);
    271       d23_temp = vqrshlq_s16(d23_temp, shift_by_bits);
    272 
    273       d01 = vqmovun_s16(d01_temp);
    274       d23 = vqmovun_s16(d23_temp);
    275 
    276       transpose_u8_4x4(&d01, &d23);
    277 
    278       if (w != 2) {
    279         vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),  // 00 01 02 03
    280                       vreinterpret_u32_u8(d01), 0);
    281         vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),  // 10 11 12 13
    282                       vreinterpret_u32_u8(d23), 0);
    283         vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),  // 20 21 22 23
    284                       vreinterpret_u32_u8(d01), 1);
    285         vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),  // 30 31 32 33
    286                       vreinterpret_u32_u8(d23), 1);
    287       } else {
    288         vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride),  // 00 01
    289                       vreinterpret_u16_u8(d01), 0);
    290         vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride),  // 10 11
    291                       vreinterpret_u16_u8(d23), 0);
    292         vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride),  // 20 21
    293                       vreinterpret_u16_u8(d01), 2);
    294         vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride),  // 30 31
    295                       vreinterpret_u16_u8(d23), 2);
    296       }
    297 
    298       s0 = s4;
    299       s1 = s5;
    300       s2 = s6;
    301       s3 = s7;
    302       s4 = s8;
    303       s5 = s9;
    304       s6 = s10;
    305       src += 4;
    306       dst += 4;
    307       w -= 4;
    308     } while (w > 0);
    309   } else {
    310 #endif
    311     int width;
    312     const uint8_t *s;
    313     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
    314 
    315 #if defined(__aarch64__)
    316     int16x8_t s8, s9, s10;
    317     uint8x8_t t4, t5, t6, t7;
    318 #endif
    319 
    320     if (w <= 4) {
    321 #if defined(__aarch64__)
    322       do {
    323         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    324         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    325         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    326         s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    327         s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    328         s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    329         s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
    330         s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
    331         s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
    332 
    333         load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
    334                     &t7);
    335         src += 8 * src_stride;
    336         __builtin_prefetch(dst + 0 * dst_stride);
    337         __builtin_prefetch(dst + 1 * dst_stride);
    338         __builtin_prefetch(dst + 2 * dst_stride);
    339         __builtin_prefetch(dst + 3 * dst_stride);
    340         __builtin_prefetch(dst + 4 * dst_stride);
    341         __builtin_prefetch(dst + 5 * dst_stride);
    342         __builtin_prefetch(dst + 6 * dst_stride);
    343         __builtin_prefetch(dst + 7 * dst_stride);
    344 
    345         transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
    346 
    347         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
    348         s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
    349         s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
    350         s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
    351 
    352         __builtin_prefetch(src + 0 * src_stride);
    353         __builtin_prefetch(src + 1 * src_stride);
    354         __builtin_prefetch(src + 2 * src_stride);
    355         __builtin_prefetch(src + 3 * src_stride);
    356         __builtin_prefetch(src + 4 * src_stride);
    357         __builtin_prefetch(src + 5 * src_stride);
    358         __builtin_prefetch(src + 6 * src_stride);
    359         __builtin_prefetch(src + 7 * src_stride);
    360         t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
    361                                  shift_round_0, shift_by_bits);
    362         t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
    363                                  shift_round_0, shift_by_bits);
    364         t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
    365                                  shift_round_0, shift_by_bits);
    366         t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
    367                                  shift_round_0, shift_by_bits);
    368 
    369         transpose_u8_8x4(&t0, &t1, &t2, &t3);
    370 
    371         if ((w == 4) && (h > 4)) {
    372           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
    373                         0);  // 00 01 02 03
    374           dst += dst_stride;
    375           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
    376                         0);  // 10 11 12 13
    377           dst += dst_stride;
    378           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
    379                         0);  // 20 21 22 23
    380           dst += dst_stride;
    381           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
    382                         0);  // 30 31 32 33
    383           dst += dst_stride;
    384           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
    385                         1);  // 40 41 42 43
    386           dst += dst_stride;
    387           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
    388                         1);  // 50 51 52 53
    389           dst += dst_stride;
    390           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
    391                         1);  // 60 61 62 63
    392           dst += dst_stride;
    393           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
    394                         1);  // 70 71 72 73
    395           dst += dst_stride;
    396         } else if ((w == 4) && (h == 2)) {
    397           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
    398                         0);  // 00 01 02 03
    399           dst += dst_stride;
    400           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
    401                         0);  // 10 11 12 13
    402           dst += dst_stride;
    403         } else if ((w == 2) && (h > 4)) {
    404           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
    405           dst += dst_stride;
    406           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
    407           dst += dst_stride;
    408           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0);  // 20 21
    409           dst += dst_stride;
    410           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0);  // 30 31
    411           dst += dst_stride;
    412           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2);  // 40 41
    413           dst += dst_stride;
    414           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2);  // 50 51
    415           dst += dst_stride;
    416           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2);  // 60 61
    417           dst += dst_stride;
    418           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2);  // 70 71
    419           dst += dst_stride;
    420         } else if ((w == 2) && (h == 2)) {
    421           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
    422           dst += dst_stride;
    423           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
    424           dst += dst_stride;
    425         }
    426         h -= 8;
    427       } while (h > 0);
    428 #else
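             // AArch32 path: filter one 4-wide row per iteration, building the
             // shifted input windows with vext_s16.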
    429     int16x8_t tt0;
    430     int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
    431     const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
    432     const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
    433     do {
    434       t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
    435       tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    436       x0 = vget_low_s16(tt0);   // a0 a1 a2 a3
    437       x4 = vget_high_s16(tt0);  // a4 a5 a6 a7
    438 
    439       t0 = vld1_u8(src + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
    440       tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    441       x7 = vget_low_s16(tt0);  // a8 a9 a10 a11
    442 
    443       x1 = vext_s16(x0, x4, 1);  // a1 a2 a3 a4
    444       x2 = vext_s16(x0, x4, 2);  // a2 a3 a4 a5
    445       x3 = vext_s16(x0, x4, 3);  // a3 a4 a5 a6
    446       x5 = vext_s16(x4, x7, 1);  // a5 a6 a7 a8
    447       x6 = vext_s16(x4, x7, 2);  // a6 a7 a8 a9
    448       x7 = vext_s16(x4, x7, 3);  // a7 a8 a9 a10
    449 
    450       src += src_stride;
    451 
    452       t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
    453                                shift_round_0_low, shift_by_bits_low);
    454 
    455       if (w == 4) {
    456         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
    457                       0);  // 00 01 02 03
    458         dst += dst_stride;
    459       } else if (w == 2) {
    460         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
    461         dst += dst_stride;
    462       }
    463       h -= 1;
    464     } while (h > 0);
    465 #endif
    466     } else {
    467       uint8_t *d;
    468       int16x8_t s11;
    469 #if defined(__aarch64__)
    470       int16x8_t s12, s13, s14;
    471       do {
    472         __builtin_prefetch(src + 0 * src_stride);
    473         __builtin_prefetch(src + 1 * src_stride);
    474         __builtin_prefetch(src + 2 * src_stride);
    475         __builtin_prefetch(src + 3 * src_stride);
    476         __builtin_prefetch(src + 4 * src_stride);
    477         __builtin_prefetch(src + 5 * src_stride);
    478         __builtin_prefetch(src + 6 * src_stride);
    479         __builtin_prefetch(src + 7 * src_stride);
    480         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    481         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    482         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    483         s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    484         s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    485         s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    486         s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
    487         s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
    488         s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
    489 
    490         width = w;
    491         s = src + 7;
    492         d = dst;
    493         __builtin_prefetch(dst + 0 * dst_stride);
    494         __builtin_prefetch(dst + 1 * dst_stride);
    495         __builtin_prefetch(dst + 2 * dst_stride);
    496         __builtin_prefetch(dst + 3 * dst_stride);
    497         __builtin_prefetch(dst + 4 * dst_stride);
    498         __builtin_prefetch(dst + 5 * dst_stride);
    499         __builtin_prefetch(dst + 6 * dst_stride);
    500         __builtin_prefetch(dst + 7 * dst_stride);
    501 
    502         do {
    503           load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    504           transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    505           s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
    506           s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
    507           s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
    508           s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
    509           s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
    510           s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
    511           s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
    512           s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
    513 
    514           t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
    515                                    shift_round_0, shift_by_bits);
    516 
    517           t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
    518                                    shift_round_0, shift_by_bits);
    519 
    520           t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
    521                                    shift_round_0, shift_by_bits);
    522 
    523           t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
    524                                    shift_round_0, shift_by_bits);
    525 
    526           t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
    527                                    shift_round_0, shift_by_bits);
    528 
    529           t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
    530                                    shift_round_0, shift_by_bits);
    531 
    532           t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
    533                                    shift_round_0, shift_by_bits);
    534 
    535           t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
    536                                    x_filter, shift_round_0, shift_by_bits);
    537 
    538           transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    539           if (h != 2) {
    540             store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
    541           } else {
    542             store_row2_u8_8x8(d, dst_stride, t0, t1);
    543           }
    544           s0 = s8;
    545           s1 = s9;
    546           s2 = s10;
    547           s3 = s11;
    548           s4 = s12;
    549           s5 = s13;
    550           s6 = s14;
    551           s += 8;
    552           d += 8;
    553           width -= 8;
    554         } while (width > 0);
    555         src += 8 * src_stride;
    556         dst += 8 * dst_stride;
    557         h -= 8;
    558       } while (h > 0);
    559 #else
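             // AArch32 path: filter one row per iteration, eight output pixels at a
             // time, building the shifted input windows with vextq_s16.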
    560     do {
    561       t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
    562       s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    563 
    564       width = w;
    565       s = src + 8;
    566       d = dst;
    567       __builtin_prefetch(dst);
    568 
    569       do {
    570         t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
    571         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
    572         s11 = s0;
    573         s0 = s7;
    574 
    575         s1 = vextq_s16(s11, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
    576         s2 = vextq_s16(s11, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
    577         s3 = vextq_s16(s11, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
    578         s4 = vextq_s16(s11, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
    579         s5 = vextq_s16(s11, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
    580         s6 = vextq_s16(s11, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
    581         s7 = vextq_s16(s11, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
    582 
    583         t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
    584                                  shift_round_0, shift_by_bits);
    585         vst1_u8(d, t0);
    586 
    587         s += 8;
    588         d += 8;
    589         width -= 8;
    590       } while (width > 0);
    591       src += src_stride;
    592       dst += dst_stride;
    593       h -= 1;
    594     } while (h > 0);
    595 #endif
    596     }
    597 #if defined(__aarch64__)
    598   }
    599 #endif
    600 }
    601 
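         // Vertical-only single-reference convolution; filtering is done entirely in
         // 16-bit arithmetic and rounded by FILTER_BITS.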
    602 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
    603                             int dst_stride, int w, int h,
    604                             const InterpFilterParams *filter_params_x,
    605                             const InterpFilterParams *filter_params_y,
    606                             const int subpel_x_q4, const int subpel_y_q4,
    607                             ConvolveParams *conv_params) {
    608   const int vert_offset = filter_params_y->taps / 2 - 1;
    609 
    610   src -= vert_offset * src_stride;
    611 
    612   (void)filter_params_x;
    613   (void)subpel_x_q4;
    614   (void)conv_params;
    615 
    616   assert(conv_params->round_0 <= FILTER_BITS);
    617   assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
    618          ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
    619 
    620   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    621       filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    622 
    623   if (w <= 4) {
    624     uint8x8_t d01;
    625     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
    626 #if defined(__aarch64__)
    627     uint8x8_t d23;
    628     int16x4_t s8, s9, s10, d1, d2, d3;
    629 #endif
    630     s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    631     src += src_stride;
    632     s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    633     src += src_stride;
    634     s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    635     src += src_stride;
    636     s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    637     src += src_stride;
    638     s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    639     src += src_stride;
    640     s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    641     src += src_stride;
    642     s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    643     src += src_stride;
    644 
    645     do {
    646       s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    647       src += src_stride;
    648 #if defined(__aarch64__)
    649       s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    650       src += src_stride;
    651       s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    652       src += src_stride;
    653       s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    654       src += src_stride;
    655 
    656       __builtin_prefetch(dst + 0 * dst_stride);
    657       __builtin_prefetch(dst + 1 * dst_stride);
    658       __builtin_prefetch(dst + 2 * dst_stride);
    659       __builtin_prefetch(dst + 3 * dst_stride);
    660       __builtin_prefetch(src + 0 * src_stride);
    661       __builtin_prefetch(src + 1 * src_stride);
    662       __builtin_prefetch(src + 2 * src_stride);
    663       __builtin_prefetch(src + 3 * src_stride);
    664       d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
    665       d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
    666       d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
    667       d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
    668 
    669       d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
    670       d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
    671       if ((w == 4) && (h != 2)) {
    672         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
    673                       0);  // 00 01 02 03
    674         dst += dst_stride;
    675         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
    676                       1);  // 10 11 12 13
    677         dst += dst_stride;
    678         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
    679                       0);  // 20 21 22 23
    680         dst += dst_stride;
    681         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
    682                       1);  // 30 31 32 33
    683         dst += dst_stride;
    684       } else if ((w == 4) && (h == 2)) {
    685         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
    686                       0);  // 00 01 02 03
    687         dst += dst_stride;
    688         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
    689                       1);  // 10 11 12 13
    690         dst += dst_stride;
    691       } else if ((w == 2) && (h != 2)) {
    692         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
    693         dst += dst_stride;
    694         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
    695         dst += dst_stride;
    696         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0);  // 20 21
    697         dst += dst_stride;
    698         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2);  // 30 31
    699         dst += dst_stride;
    700       } else if ((w == 2) && (h == 2)) {
    701         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
    702         dst += dst_stride;
    703         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
    704         dst += dst_stride;
    705       }
    706       s0 = s4;
    707       s1 = s5;
    708       s2 = s6;
    709       s3 = s7;
    710       s4 = s8;
    711       s5 = s9;
    712       s6 = s10;
    713       h -= 4;
    714 #else
    715       __builtin_prefetch(dst + 0 * dst_stride);
    716       __builtin_prefetch(src + 0 * src_stride);
    717 
    718       d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
    719 
    720       d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);
    721 
    722       if (w == 4) {
    723         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
    724         dst += dst_stride;
    725       } else if (w == 2) {
    726         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
    727         dst += dst_stride;
    728       }
    729       s0 = s1;
    730       s1 = s2;
    731       s2 = s3;
    732       s3 = s4;
    733       s4 = s5;
    734       s5 = s6;
    735       s6 = s7;
    736       h -= 1;
    737 #endif
    738     } while (h > 0);
    739   } else {
    740     int height;
    741     const uint8_t *s;
    742     uint8_t *d;
    743     uint8x8_t t0;
    744     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
    745 #if defined(__aarch64__)
    746     uint8x8_t t1, t2, t3;
    747     int16x8_t s8, s9, s10;
    748 #endif
    749     do {
    750       __builtin_prefetch(src + 0 * src_stride);
    751       __builtin_prefetch(src + 1 * src_stride);
    752       __builtin_prefetch(src + 2 * src_stride);
    753       __builtin_prefetch(src + 3 * src_stride);
    754       __builtin_prefetch(src + 4 * src_stride);
    755       __builtin_prefetch(src + 5 * src_stride);
    756       __builtin_prefetch(src + 6 * src_stride);
    757       s = src;
    758       s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
    759       s += src_stride;
    760       s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
    761       s += src_stride;
    762       s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
    763       s += src_stride;
    764       s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
    765       s += src_stride;
    766       s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
    767       s += src_stride;
    768       s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
    769       s += src_stride;
    770       s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
    771       s += src_stride;
    772       d = dst;
    773       height = h;
    774 
    775       do {
    776         s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
    777         s += src_stride;
    778 #if defined(__aarch64__)
    779         s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
    780         s += src_stride;
    781         s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
    782         s += src_stride;
    783         s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
    784         s += src_stride;
    785 
    786         __builtin_prefetch(d + 0 * dst_stride);
    787         __builtin_prefetch(d + 1 * dst_stride);
    788         __builtin_prefetch(d + 2 * dst_stride);
    789         __builtin_prefetch(d + 3 * dst_stride);
    790         __builtin_prefetch(s + 0 * src_stride);
    791         __builtin_prefetch(s + 1 * src_stride);
    792         __builtin_prefetch(s + 2 * src_stride);
    793         __builtin_prefetch(s + 3 * src_stride);
    794         t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
    795         t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
    796         t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
    797         t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
    798         if (h != 2) {
    799           vst1_u8(d, t0);
    800           d += dst_stride;
    801           vst1_u8(d, t1);
    802           d += dst_stride;
    803           vst1_u8(d, t2);
    804           d += dst_stride;
    805           vst1_u8(d, t3);
    806           d += dst_stride;
    807         } else {
    808           vst1_u8(d, t0);
    809           d += dst_stride;
    810           vst1_u8(d, t1);
    811           d += dst_stride;
    812         }
    813         s0 = s4;
    814         s1 = s5;
    815         s2 = s6;
    816         s3 = s7;
    817         s4 = s8;
    818         s5 = s9;
    819         s6 = s10;
    820         height -= 4;
    821 #else
    822         __builtin_prefetch(d);
    823         __builtin_prefetch(s);
    824 
    825         t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
    826 
    827         vst1_u8(d, t0);
    828         d += dst_stride;
    829 
    830         s0 = s1;
    831         s1 = s2;
    832         s2 = s3;
    833         s3 = s4;
    834         s4 = s5;
    835         s5 = s6;
    836         s6 = s7;
    837         height -= 1;
    838 #endif
    839       } while (height > 0);
    840       src += 8;
    841       dst += 8;
    842       w -= 8;
    843     } while (w > 0);
    844   }
    845 }
    846 
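         // 2D single-reference convolution: a horizontal pass writes 16-bit
         // intermediate values into im_block, then a vertical pass filters that
         // block with 32-bit accumulation and narrows the result back to 8 bits.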
    847 void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
    848                              int dst_stride, int w, int h,
    849                              const InterpFilterParams *filter_params_x,
    850                              const InterpFilterParams *filter_params_y,
    851                              const int subpel_x_q4, const int subpel_y_q4,
    852                              ConvolveParams *conv_params) {
    853   int im_dst_stride;
    854   int width, height;
    855   uint8x8_t t0;
    856 #if defined(__aarch64__)
    857   uint8x8_t t1, t2, t3, t4, t5, t6, t7;
    858 #endif
    859 
    860   DECLARE_ALIGNED(16, int16_t,
    861                   im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
    862 
    863   const int bd = 8;
    864   const int im_h = h + filter_params_y->taps - 1;
    865   const int im_stride = MAX_SB_SIZE;
    866   const int vert_offset = filter_params_y->taps / 2 - 1;
    867   const int horiz_offset = filter_params_x->taps / 2 - 1;
    868 
    869   const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
    870   const uint8_t *s;
    871   int16_t *dst_ptr;
    872 
    873   dst_ptr = im_block;
    874   im_dst_stride = im_stride;
    875   height = im_h;
    876   width = w;
    877 
    878   const int16_t round_bits =
    879       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    880   const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
    881   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    882   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    883       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    884 
    885   int16_t x_filter_tmp[8];
    886   int16x8_t filter_x_coef = vld1q_s16(x_filter);
    887 
     888   // The filter coefficients are all even, so downshift them by 1 to reduce
     889   // the intermediate precision requirements.
    890   filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
    891   vst1q_s16(&x_filter_tmp[0], filter_x_coef);
    892 
    893   assert(conv_params->round_0 > 0);
    894 
    895   if (w <= 4) {
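             // Horizontal pass for narrow blocks. The (round_0 - 1) shift and the
             // horiz_const offset account for the filter coefficients having been
             // downshifted by 1 above.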
    896     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
    897 #if defined(__aarch64__)
    898     int16x4_t s8, s9, s10, d1, d2, d3;
    899 #endif
    900 
    901     const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
    902     const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
    903 
    904     do {
    905       s = src_ptr;
    906 
    907 #if defined(__aarch64__)
    908       __builtin_prefetch(s + 0 * src_stride);
    909       __builtin_prefetch(s + 1 * src_stride);
    910       __builtin_prefetch(s + 2 * src_stride);
    911       __builtin_prefetch(s + 3 * src_stride);
    912 
    913       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
    914       transpose_u8_8x4(&t0, &t1, &t2, &t3);
    915 
    916       s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    917       s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    918       s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    919       s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    920       s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    921       s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    922       s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    923 
    924       __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
    925       __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
    926       __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
    927       __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
    928       s += 7;
    929 
    930       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
    931       transpose_u8_8x4(&t0, &t1, &t2, &t3);
    932 
    933       s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    934       s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    935       s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    936       s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    937 
    938       d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
    939                              horiz_const, shift_round_0);
    940       d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
    941                              horiz_const, shift_round_0);
    942       d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
    943                              horiz_const, shift_round_0);
    944       d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
    945                              horiz_const, shift_round_0);
    946 
    947       transpose_s16_4x4d(&d0, &d1, &d2, &d3);
    948       if (w == 4) {
    949         vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
    950         vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
    951         vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
    952         vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
    953       } else if (w == 2) {
    954         vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
    955                       vreinterpret_u32_s16(d0), 0);
    956         vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
    957                       vreinterpret_u32_s16(d1), 0);
    958         vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
    959                       vreinterpret_u32_s16(d2), 0);
    960         vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
    961                       vreinterpret_u32_s16(d3), 0);
    962       }
    963       src_ptr += 4 * src_stride;
    964       dst_ptr += 4 * im_dst_stride;
    965       height -= 4;
    966 #else
    967       int16x8_t tt0;
    968 
    969       __builtin_prefetch(s);
    970 
    971       t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
    972       tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    973       s0 = vget_low_s16(tt0);
    974       s4 = vget_high_s16(tt0);
    975 
    976       __builtin_prefetch(dst_ptr);
    977       s += 8;
    978 
    979       t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
    980       s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    981 
    982       s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
    983       s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
    984       s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
    985       s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
    986       s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
    987       s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
    988 
    989       d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
    990                              horiz_const, shift_round_0);
    991 
    992       if (w == 4) {
    993         vst1_s16(dst_ptr, d0);
    994         dst_ptr += im_dst_stride;
    995       } else if (w == 2) {
    996         vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
    997         dst_ptr += im_dst_stride;
    998       }
    999 
   1000       src_ptr += src_stride;
   1001       height -= 1;
   1002 #endif
   1003     } while (height > 0);
   1004   } else {
   1005     int16_t *d_tmp;
   1006     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
   1007 #if defined(__aarch64__)
   1008     int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
   1009     int16x8_t s11, s12, s13, s14;
   1010 #endif
   1011 
   1012     const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
   1013     const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
   1014 
   1015 #if defined(__aarch64__)
   1016     do {
   1017       __builtin_prefetch(src_ptr + 0 * src_stride);
   1018       __builtin_prefetch(src_ptr + 1 * src_stride);
   1019       __builtin_prefetch(src_ptr + 2 * src_stride);
   1020       __builtin_prefetch(src_ptr + 3 * src_stride);
   1021       __builtin_prefetch(src_ptr + 4 * src_stride);
   1022       __builtin_prefetch(src_ptr + 5 * src_stride);
   1023       __builtin_prefetch(src_ptr + 6 * src_stride);
   1024       __builtin_prefetch(src_ptr + 7 * src_stride);
   1025 
   1026       load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1027 
   1028       transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1029 
   1030       s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1031       s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1032       s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
   1033       s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
   1034       s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
   1035       s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
   1036       s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
   1037 
   1038       width = w;
   1039       s = src_ptr + 7;
   1040       d_tmp = dst_ptr;
   1041 
   1042       __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
   1043       __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
   1044       __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
   1045       __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
   1046       __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
   1047       __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
   1048       __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
   1049       __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
   1050 
   1051       do {
   1052         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1053 
   1054         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1055 
   1056         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1057         s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1058         s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
   1059         s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
   1060         s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
   1061         s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
   1062         s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
   1063         s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
   1064 
   1065         res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
   1066                                  horiz_const, shift_round_0);
   1067         res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
   1068                                  horiz_const, shift_round_0);
   1069         res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
   1070                                  horiz_const, shift_round_0);
   1071         res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
   1072                                  horiz_const, shift_round_0);
   1073         res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
   1074                                  horiz_const, shift_round_0);
   1075         res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
   1076                                  x_filter_tmp, horiz_const, shift_round_0);
   1077         res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
   1078                                  x_filter_tmp, horiz_const, shift_round_0);
   1079         res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
   1080                                  x_filter_tmp, horiz_const, shift_round_0);
   1081 
   1082         transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
   1083                           &res7);
   1084 
   1085         store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
   1086                       res6, res7);
   1087 
   1088         s0 = s8;
   1089         s1 = s9;
   1090         s2 = s10;
   1091         s3 = s11;
   1092         s4 = s12;
   1093         s5 = s13;
   1094         s6 = s14;
   1095         s += 8;
   1096         d_tmp += 8;
   1097         width -= 8;
   1098       } while (width > 0);
   1099       src_ptr += 8 * src_stride;
   1100       dst_ptr += 8 * im_dst_stride;
   1101       height -= 8;
   1102     } while (height > 0);
   1103 #else
   1104     do {
   1105       t0 = vld1_u8(src_ptr);
   1106       s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
   1107 
   1108       width = w;
   1109       s = src_ptr + 8;
   1110       d_tmp = dst_ptr;
   1111 
   1112       __builtin_prefetch(dst_ptr);
   1113 
   1114       do {
   1115         t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
   1116         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1117         int16x8_t sum = s0;
   1118         s0 = s7;
   1119 
   1120         s1 = vextq_s16(sum, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
   1121         s2 = vextq_s16(sum, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
   1122         s3 = vextq_s16(sum, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
   1123         s4 = vextq_s16(sum, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
   1124         s5 = vextq_s16(sum, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
   1125         s6 = vextq_s16(sum, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
   1126         s7 = vextq_s16(sum, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
   1127 
   1128         res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
   1129                                  horiz_const, shift_round_0);
   1130 
   1131         vst1q_s16(d_tmp, res0);
   1132 
   1133         s += 8;
   1134         d_tmp += 8;
   1135         width -= 8;
   1136       } while (width > 0);
   1137       src_ptr += src_stride;
   1138       dst_ptr += im_dst_stride;
   1139       height -= 1;
   1140     } while (height > 0);
   1141 #endif
   1142   }
   1143 
   1144   // vertical
   1145   {
   1146     uint8_t *dst_u8_ptr, *d_u8;
   1147     int16_t *v_src_ptr, *v_s;
   1148 
   1149     const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
   1150                               (1 << (offset_bits - conv_params->round_1 - 1));
   1151     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
   1152         filter_params_y, subpel_y_q4 & SUBPEL_MASK);
   1153 
   1154     const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
   1155     const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
   1156     const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
   1157 
   1158     src_stride = im_stride;
   1159     v_src_ptr = im_block;
   1160     dst_u8_ptr = dst;
   1161 
   1162     height = h;
   1163     width = w;
   1164 
   1165     if (width <= 4) {
   1166       int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
   1167       uint16x4_t d0;
   1168       uint16x8_t dd0;
   1169       uint8x8_t d01;
   1170 
   1171 #if defined(__aarch64__)
   1172       int16x4_t s8, s9, s10;
   1173       uint16x4_t d1, d2, d3;
   1174       uint16x8_t dd1;
   1175       uint8x8_t d23;
   1176 #endif
   1177 
   1178       d_u8 = dst_u8_ptr;
   1179       v_s = v_src_ptr;
   1180 
   1181       __builtin_prefetch(v_s + 0 * im_stride);
   1182       __builtin_prefetch(v_s + 1 * im_stride);
   1183       __builtin_prefetch(v_s + 2 * im_stride);
   1184       __builtin_prefetch(v_s + 3 * im_stride);
   1185       __builtin_prefetch(v_s + 4 * im_stride);
   1186       __builtin_prefetch(v_s + 5 * im_stride);
   1187       __builtin_prefetch(v_s + 6 * im_stride);
   1188       __builtin_prefetch(v_s + 7 * im_stride);
   1189 
   1190       load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
   1191       v_s += (7 * im_stride);
   1192 
   1193       do {
   1194 #if defined(__aarch64__)
   1195         load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
   1196         v_s += (im_stride << 2);
   1197 
   1198         __builtin_prefetch(d_u8 + 0 * dst_stride);
   1199         __builtin_prefetch(d_u8 + 1 * dst_stride);
   1200         __builtin_prefetch(d_u8 + 2 * dst_stride);
   1201         __builtin_prefetch(d_u8 + 3 * dst_stride);
   1202 
   1203         d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   1204                                     round_shift_vec, offset_const,
   1205                                     sub_const_vec);
   1206         d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
   1207                                     round_shift_vec, offset_const,
   1208                                     sub_const_vec);
   1209         d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
   1210                                     round_shift_vec, offset_const,
   1211                                     sub_const_vec);
   1212         d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
   1213                                     round_shift_vec, offset_const,
   1214                                     sub_const_vec);
   1215 
   1216         dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
   1217         dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
   1218 
   1219         d01 = vqmovn_u16(dd0);
   1220         d23 = vqmovn_u16(dd1);
   1221 
   1222         if ((w == 4) && (h != 2)) {
   1223           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
   1224                         0);  // 00 01 02 03
   1225           d_u8 += dst_stride;
   1226           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
   1227                         1);  // 10 11 12 13
   1228           d_u8 += dst_stride;
   1229           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
   1230                         0);  // 20 21 22 23
   1231           d_u8 += dst_stride;
   1232           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
   1233                         1);  // 30 31 32 33
   1234           d_u8 += dst_stride;
   1235         } else if ((w == 2) && (h != 2)) {
   1236           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
   1237                         0);  // 00 01
   1238           d_u8 += dst_stride;
   1239           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
   1240                         2);  // 10 11
   1241           d_u8 += dst_stride;
   1242           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
   1243                         0);  // 20 21
   1244           d_u8 += dst_stride;
   1245           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
   1246                         2);  // 30 31
   1247           d_u8 += dst_stride;
   1248         } else if ((w == 4) && (h == 2)) {
   1249           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
   1250                         0);  // 00 01 02 03
   1251           d_u8 += dst_stride;
   1252           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
   1253                         1);  // 10 11 12 13
   1254           d_u8 += dst_stride;
   1255         } else if ((w == 2) && (h == 2)) {
   1256           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
   1257                         0);  // 00 01
   1258           d_u8 += dst_stride;
   1259           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
   1260                         2);  // 10 11
   1261           d_u8 += dst_stride;
   1262         }
   1263 
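                // Slide the eight-row filter window down by four rows.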
   1264         s0 = s4;
   1265         s1 = s5;
   1266         s2 = s6;
   1267         s3 = s7;
   1268         s4 = s8;
   1269         s5 = s9;
   1270         s6 = s10;
   1271         height -= 4;
   1272 #else
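                // Armv7: compute and store one output row per pass.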
   1273         s7 = vld1_s16(v_s);
   1274         v_s += im_stride;
   1275 
   1276         __builtin_prefetch(d_u8 + 0 * dst_stride);
   1277 
   1278         d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   1279                                     round_shift_vec, offset_const,
   1280                                     sub_const_vec);
   1281 
   1282         dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
   1283         d01 = vqmovn_u16(dd0);
   1284 
   1285         if (w == 4) {
   1286           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
   1287                         0);  // 00 01 02 03
   1288           d_u8 += dst_stride;
   1289 
   1290         } else if (w == 2) {
   1291           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
   1292                         0);  // 00 01
   1293           d_u8 += dst_stride;
   1294         }
   1295 
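                // Slide the filter window down by one row.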
   1296         s0 = s1;
   1297         s1 = s2;
   1298         s2 = s3;
   1299         s3 = s4;
   1300         s4 = s5;
   1301         s5 = s6;
   1302         s6 = s7;
   1303         height -= 1;
   1304 #endif
   1305       } while (height > 0);
   1306     } else {
    1307       // Width is a multiple of 8: process the block in 8-wide columns.
   1308       int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
   1309       uint8x8_t res0;
   1310 #if defined(__aarch64__)
   1311       int16x8_t s8, s9, s10;
   1312       uint8x8_t res1, res2, res3;
   1313 #endif
   1314 
   1315       do {
   1316         __builtin_prefetch(v_src_ptr + 0 * im_stride);
   1317         __builtin_prefetch(v_src_ptr + 1 * im_stride);
   1318         __builtin_prefetch(v_src_ptr + 2 * im_stride);
   1319         __builtin_prefetch(v_src_ptr + 3 * im_stride);
   1320         __builtin_prefetch(v_src_ptr + 4 * im_stride);
   1321         __builtin_prefetch(v_src_ptr + 5 * im_stride);
   1322         __builtin_prefetch(v_src_ptr + 6 * im_stride);
   1323         __builtin_prefetch(v_src_ptr + 7 * im_stride);
   1324 
   1325         v_s = v_src_ptr;
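                // Rows 0-6 of this column provide the initial filter context;
                // row 7 is reloaded as s7 in the loop below.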
   1326         load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
   1327         v_s += (7 * im_stride);
   1328 
   1329         d_u8 = dst_u8_ptr;
   1330         height = h;
   1331 
   1332         do {
   1333 #if defined(__aarch64__)
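                  // AArch64: load the next four rows and compute four 8-pixel
                  // output rows per pass.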
   1334           load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
   1335           v_s += (im_stride << 2);
   1336 
   1337           __builtin_prefetch(d_u8 + 4 * dst_stride);
   1338           __builtin_prefetch(d_u8 + 5 * dst_stride);
   1339           __builtin_prefetch(d_u8 + 6 * dst_stride);
   1340           __builtin_prefetch(d_u8 + 7 * dst_stride);
   1341 
   1342           res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
   1343                                         y_filter, round_shift_vec, offset_const,
   1344                                         sub_const_vec, vec_round_bits);
   1345           res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
   1346                                         y_filter, round_shift_vec, offset_const,
   1347                                         sub_const_vec, vec_round_bits);
   1348           res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
   1349                                         y_filter, round_shift_vec, offset_const,
   1350                                         sub_const_vec, vec_round_bits);
   1351           res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
   1352                                         y_filter, round_shift_vec, offset_const,
   1353                                         sub_const_vec, vec_round_bits);
   1354 
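                  // h == 2 blocks write only the first two of the four computed
                  // rows.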
   1355           if (h != 2) {
   1356             vst1_u8(d_u8, res0);
   1357             d_u8 += dst_stride;
   1358             vst1_u8(d_u8, res1);
   1359             d_u8 += dst_stride;
   1360             vst1_u8(d_u8, res2);
   1361             d_u8 += dst_stride;
   1362             vst1_u8(d_u8, res3);
   1363             d_u8 += dst_stride;
   1364           } else {
   1365             vst1_u8(d_u8, res0);
   1366             d_u8 += dst_stride;
   1367             vst1_u8(d_u8, res1);
   1368             d_u8 += dst_stride;
   1369           }
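                  // Slide the eight-row filter window down by four rows.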
   1370           s0 = s4;
   1371           s1 = s5;
   1372           s2 = s6;
   1373           s3 = s7;
   1374           s4 = s8;
   1375           s5 = s9;
   1376           s6 = s10;
   1377           height -= 4;
   1378 #else
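                  // Armv7: compute and store one 8-pixel output row per pass.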
   1379           s7 = vld1q_s16(v_s);
   1380           v_s += im_stride;
   1381 
   1382           __builtin_prefetch(d_u8 + 0 * dst_stride);
   1383 
   1384           res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
   1385                                         y_filter, round_shift_vec, offset_const,
   1386                                         sub_const_vec, vec_round_bits);
   1387 
   1388           vst1_u8(d_u8, res0);
   1389           d_u8 += dst_stride;
   1390 
   1391           s0 = s1;
   1392           s1 = s2;
   1393           s2 = s3;
   1394           s3 = s4;
   1395           s4 = s5;
   1396           s5 = s6;
   1397           s6 = s7;
   1398           height -= 1;
   1399 #endif
   1400         } while (height > 0);
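                // Advance source and destination to the next 8-wide column.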
   1401         v_src_ptr += 8;
   1402         dst_u8_ptr += 8;
   1403         w -= 8;
   1404       } while (w > 0);
   1405     }
   1406   }
    1407 }

   1408 void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
   1409                                   uint8_t *dst, int dst_stride, int w, int h,
   1410                                   const InterpFilterParams *filter_params_x,
   1411                                   const InterpFilterParams *filter_params_y,
   1412                                   const int subpel_x_q4, const int subpel_y_q4,
   1413                                   ConvolveParams *conv_params) {
   1414   (void)filter_params_x;
   1415   (void)filter_params_y;
   1416   (void)subpel_x_q4;
   1417   (void)subpel_y_q4;
   1418   (void)conv_params;
   1419 
   1420   const uint8_t *src1;
   1421   uint8_t *dst1;
   1422   int y;
   1423 
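          // Straight-copy path: the filter arguments are unused. Block widths
          // reaching this function are powers of two, so exactly one of the
          // branches below is taken.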
   1424   if (!(w & 0x0F)) {
   1425     for (y = 0; y < h; ++y) {
   1426       src1 = src;
   1427       dst1 = dst;
   1428       for (int x = 0; x < (w >> 4); ++x) {
   1429         vst1q_u8(dst1, vld1q_u8(src1));
   1430         src1 += 16;
   1431         dst1 += 16;
   1432       }
   1433       src += src_stride;
   1434       dst += dst_stride;
   1435     }
   1436   } else if (!(w & 0x07)) {
   1437     for (y = 0; y < h; ++y) {
   1438       vst1_u8(dst, vld1_u8(src));
   1439       src += src_stride;
   1440       dst += dst_stride;
   1441     }
   1442   } else if (!(w & 0x03)) {
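            // vld1_u8 reads 8 bytes; only the first 4 (or 2 in the next branch)
            // are stored.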
   1443     for (y = 0; y < h; ++y) {
   1444       vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0);
   1445       src += src_stride;
   1446       dst += dst_stride;
   1447     }
   1448   } else if (!(w & 0x01)) {
   1449     for (y = 0; y < h; ++y) {
   1450       vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0);
   1451       src += src_stride;
   1452       dst += dst_stride;
   1453     }
   1454   }
   1455 }
   1456