      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
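         /* MSVC does not provide __builtin_prefetch, so define it away here and
            let the prefetch hints used below compile to nothing on that toolchain. */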
     13 #ifdef _MSC_VER
     14 #define __builtin_prefetch(x)
     15 #endif
     16 
     17 static const int8_t vp8_sub_pel_filters[8][8] = {
      18     {0,  0,  128,   0,   0, 0, 0, 0},  /* note that 1/8 pel positions are */
     19     {0, -6,  123,  12,  -1, 0, 0, 0},  /*    just as per alpha -0.5 bicubic */
     20     {2, -11, 108,  36,  -8, 1, 0, 0},  /* New 1/4 pel 6 tap filter */
     21     {0, -9,   93,  50,  -6, 0, 0, 0},
     22     {3, -16,  77,  77, -16, 3, 0, 0},  /* New 1/2 pel 6 tap filter */
     23     {0, -6,   50,  93,  -9, 0, 0, 0},
     24     {1, -8,   36, 108, -11, 2, 0, 0},  /* New 1/4 pel 6 tap filter */
     25     {0, -1,   12, 123,  -6, 0, 0, 0},
     26 };
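
         /*
          * Each row above holds the six taps of one sub-pel filter, padded with two
          * zeros so a whole row can be loaded with a single vld1_s8.  In rough
          * scalar form (a sketch for reference, not part of the original code) a
          * filtered pixel is:
          *
          *     sum = f[0]*p[-2] + f[1]*p[-1] + f[2]*p[0]
          *         + f[3]*p[1]  + f[4]*p[2]  + f[5]*p[3];
          *     out = clamp((sum + 64) >> 7, 0, 255);
          *
          * The NEON code takes the absolute value of every tap and then uses
          * vmlsl_u8 for the negative taps (positions 1 and 4) and vmull_u8/vmlal_u8
          * for the rest; vqrshrun_n_s16(x, 7) performs the rounded shift and the
          * saturation to an unsigned byte.
          */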
     27 
     28 void vp8_sixtap_predict4x4_neon(
     29         unsigned char *src_ptr,
     30         int src_pixels_per_line,
     31         int xoffset,
     32         int yoffset,
     33         unsigned char *dst_ptr,
     34         int dst_pitch) {
     35     unsigned char *src;
     36     uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8;
     37     uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
     38     int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
     39     uint32x2_t d27u32, d28u32, d29u32, d30u32, d31u32;
     40     uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
     41     uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
     42     int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
     43     int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
     44     uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8;
     45     uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64;
     46     uint32x2x2_t d0u32x2, d1u32x2;
     47 
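             /*
              * Three cases: xoffset == 0 runs only the vertical (second-pass)
              * filter, yoffset == 0 runs only the horizontal (first-pass) filter,
              * and the general case filters 4 + 5 = 9 source rows horizontally
              * and then filters that intermediate block vertically.
              */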
     48     if (xoffset == 0) {  // secondpass_filter4x4_only
     49         // load second_pass filter
     50         dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
     51         d0s8 = vdup_lane_s8(dtmps8, 0);
     52         d1s8 = vdup_lane_s8(dtmps8, 1);
     53         d2s8 = vdup_lane_s8(dtmps8, 2);
     54         d3s8 = vdup_lane_s8(dtmps8, 3);
     55         d4s8 = vdup_lane_s8(dtmps8, 4);
     56         d5s8 = vdup_lane_s8(dtmps8, 5);
     57         d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
     58         d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
     59         d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
     60         d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
     61         d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
     62         d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
     63 
     64         // load src data
     65         src = src_ptr - src_pixels_per_line * 2;
     66         d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0);
     67         src += src_pixels_per_line;
     68         d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1);
     69         src += src_pixels_per_line;
     70         d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0);
     71         src += src_pixels_per_line;
     72         d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1);
     73         src += src_pixels_per_line;
     74         d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0);
     75         src += src_pixels_per_line;
     76         d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1);
     77         src += src_pixels_per_line;
     78         d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0);
     79         src += src_pixels_per_line;
     80         d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1);
     81         src += src_pixels_per_line;
     82         d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0);
     83 
     84         d27u8 = vreinterpret_u8_u32(d27u32);
     85         d28u8 = vreinterpret_u8_u32(d28u32);
     86         d29u8 = vreinterpret_u8_u32(d29u32);
     87         d30u8 = vreinterpret_u8_u32(d30u32);
     88         d31u8 = vreinterpret_u8_u32(d31u32);
     89 
     90         d23u8 = vext_u8(d27u8, d28u8, 4);
     91         d24u8 = vext_u8(d28u8, d29u8, 4);
     92         d25u8 = vext_u8(d29u8, d30u8, 4);
     93         d26u8 = vext_u8(d30u8, d31u8, 4);
     94 
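                 // Each d register holds two packed 4-pixel rows; the 4-byte vext
                 // results above are the same rows shifted down by one line, so the
                 // two accumulator pairs below evaluate the vertical 6-tap filter
                 // for all four output rows at once (taps 0/2/4 in q3/q4, taps
                 // 1/3/5 in q5/q6, summed before the narrowing shift).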
     95         q3u16 = vmull_u8(d27u8, d0u8);
     96         q4u16 = vmull_u8(d28u8, d0u8);
     97         q5u16 = vmull_u8(d25u8, d5u8);
     98         q6u16 = vmull_u8(d26u8, d5u8);
     99 
    100         q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
    101         q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
    102         q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
    103         q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
    104 
    105         q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
    106         q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
    107         q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
    108         q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
    109 
    110         q3s16 = vreinterpretq_s16_u16(q3u16);
    111         q4s16 = vreinterpretq_s16_u16(q4u16);
    112         q5s16 = vreinterpretq_s16_u16(q5u16);
    113         q6s16 = vreinterpretq_s16_u16(q6u16);
    114 
    115         q5s16 = vqaddq_s16(q5s16, q3s16);
    116         q6s16 = vqaddq_s16(q6s16, q4s16);
    117 
    118         d3u8 = vqrshrun_n_s16(q5s16, 7);
    119         d4u8 = vqrshrun_n_s16(q6s16, 7);
    120 
    121         vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
    122         dst_ptr += dst_pitch;
    123         vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
    124         dst_ptr += dst_pitch;
    125         vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
    126         dst_ptr += dst_pitch;
    127         vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
    128         return;
    129     }
    130 
    131     // load first_pass filter
    132     dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    133     d0s8 = vdup_lane_s8(dtmps8, 0);
    134     d1s8 = vdup_lane_s8(dtmps8, 1);
    135     d2s8 = vdup_lane_s8(dtmps8, 2);
    136     d3s8 = vdup_lane_s8(dtmps8, 3);
    137     d4s8 = vdup_lane_s8(dtmps8, 4);
    138     d5s8 = vdup_lane_s8(dtmps8, 5);
    139     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    140     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    141     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    142     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    143     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    144     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    145 
    146     // First pass: output_height lines x output_width columns (9x4)
    147 
    148     if (yoffset == 0)  // firstpass_filter4x4_only
    149         src = src_ptr - 2;
    150     else
    151         src = src_ptr - 2 - (src_pixels_per_line * 2);
    152 
    153     q3u8 = vld1q_u8(src);
    154     src += src_pixels_per_line;
    155     q4u8 = vld1q_u8(src);
    156     src += src_pixels_per_line;
    157     q5u8 = vld1q_u8(src);
    158     src += src_pixels_per_line;
    159     q6u8 = vld1q_u8(src);
    160     src += src_pixels_per_line;
    161 
    162     d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    163     d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    164     d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    165     d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    166 
     167     // pack two source rows per q register (a vswp in the original assembly)
    168     q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
    169     q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
    170 
    171     d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
    172                        vreinterpret_u32_u8(d19u8));
    173     d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
    174                        vreinterpret_u32_u8(d21u8));
    175     q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
    176     q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
    177 
    178     // keep original src data in q4 q6
    179     q4u64 = vreinterpretq_u64_u8(q3u8);
    180     q6u64 = vreinterpretq_u64_u8(q5u8);
    181 
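             // Viewing each packed 8-pixel row as a u64 lane lets vshrq_n_u64 by
             // 8/16/24/32 bits act as a 1..4 pixel advance along the row; the
             // vzip_u32 calls then pair the low four pixels of two rows in one
             // d register so each multiply covers two rows of the 4-wide block.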
    182     d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
    183                        vreinterpret_u32_u8(vget_high_u8(q3u8)));
    184     d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
    185                        vreinterpret_u32_u8(vget_high_u8(q5u8)));
    186     q9u64 = vshrq_n_u64(q4u64, 8);
    187     q10u64 = vshrq_n_u64(q6u64, 8);
    188     q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
    189     q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
    190 
    191     d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
    192                        vreinterpret_u32_u64(vget_high_u64(q9u64)));
     193     d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
    194                        vreinterpret_u32_u64(vget_high_u64(q10u64)));
    195     q3u64 = vshrq_n_u64(q4u64, 32);
    196     q5u64 = vshrq_n_u64(q6u64, 32);
    197     q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
    198     q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
    199 
    200     d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
    201                        vreinterpret_u32_u64(vget_high_u64(q3u64)));
    202     d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
    203                        vreinterpret_u32_u64(vget_high_u64(q5u64)));
    204     q9u64 = vshrq_n_u64(q4u64, 16);
    205     q10u64 = vshrq_n_u64(q6u64, 16);
    206     q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
    207     q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
    208 
    209     d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
    210                        vreinterpret_u32_u64(vget_high_u64(q9u64)));
     211     d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
    212                        vreinterpret_u32_u64(vget_high_u64(q10u64)));
    213     q3u64 = vshrq_n_u64(q4u64, 24);
    214     q5u64 = vshrq_n_u64(q6u64, 24);
    215     q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
    216     q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
    217 
    218     d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
    219                        vreinterpret_u32_u64(vget_high_u64(q3u64)));
    220     d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
    221                        vreinterpret_u32_u64(vget_high_u64(q5u64)));
    222     q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
    223     q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
    224 
    225     q7s16 = vreinterpretq_s16_u16(q7u16);
    226     q8s16 = vreinterpretq_s16_u16(q8u16);
    227     q9s16 = vreinterpretq_s16_u16(q9u16);
    228     q10s16 = vreinterpretq_s16_u16(q10u16);
    229     q7s16 = vqaddq_s16(q7s16, q9s16);
    230     q8s16 = vqaddq_s16(q8s16, q10s16);
    231 
    232     d27u8 = vqrshrun_n_s16(q7s16, 7);
    233     d28u8 = vqrshrun_n_s16(q8s16, 7);
    234 
    235     if (yoffset == 0) {  // firstpass_filter4x4_only
    236         vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0);
    237         dst_ptr += dst_pitch;
    238         vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1);
    239         dst_ptr += dst_pitch;
    240         vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
    241         dst_ptr += dst_pitch;
    242         vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
    243         return;
    244     }
    245 
     246     // First pass on the remaining 5 rows of data
    247     q3u8 = vld1q_u8(src);
    248     src += src_pixels_per_line;
    249     q4u8 = vld1q_u8(src);
    250     src += src_pixels_per_line;
    251     q5u8 = vld1q_u8(src);
    252     src += src_pixels_per_line;
    253     q6u8 = vld1q_u8(src);
    254     src += src_pixels_per_line;
    255     q11u8 = vld1q_u8(src);
    256 
    257     d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    258     d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    259     d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    260     d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    261 
     262     // pack two source rows per q register (a vswp in the original assembly)
    263     q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
    264     q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
    265 
    266     d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
    267                        vreinterpret_u32_u8(d19u8));
    268     d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
    269                        vreinterpret_u32_u8(d21u8));
    270     d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5);
    271     q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
    272     q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
    273     q12u16 = vmull_u8(d31u8, d5u8);
    274 
    275     q4u64 = vreinterpretq_u64_u8(q3u8);
    276     q6u64 = vreinterpretq_u64_u8(q5u8);
    277 
    278     d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
    279                        vreinterpret_u32_u8(vget_high_u8(q3u8)));
    280     d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
    281                        vreinterpret_u32_u8(vget_high_u8(q5u8)));
    282     q9u64 = vshrq_n_u64(q4u64, 8);
    283     q10u64 = vshrq_n_u64(q6u64, 8);
    284     q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
    285     q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
    286     q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8);
    287 
    288     d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
    289                        vreinterpret_u32_u64(vget_high_u64(q9u64)));
     290     d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
    291                        vreinterpret_u32_u64(vget_high_u64(q10u64)));
    292     q3u64 = vshrq_n_u64(q4u64, 32);
    293     q5u64 = vshrq_n_u64(q6u64, 32);
    294     d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1);
    295     q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
    296     q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
    297     q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
    298 
    299     d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
    300                        vreinterpret_u32_u64(vget_high_u64(q3u64)));
    301     d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
    302                        vreinterpret_u32_u64(vget_high_u64(q5u64)));
    303     q9u64 = vshrq_n_u64(q4u64, 16);
    304     q10u64 = vshrq_n_u64(q6u64, 16);
    305     d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4);
    306     q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
    307     q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
    308     q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
    309 
    310     d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
    311                        vreinterpret_u32_u64(vget_high_u64(q9u64)));
     312     d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
    313                        vreinterpret_u32_u64(vget_high_u64(q10u64)));
    314     q3u64 = vshrq_n_u64(q4u64, 24);
    315     q5u64 = vshrq_n_u64(q6u64, 24);
    316     d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2);
    317     q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
    318     q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
    319     q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
    320 
    321     d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
    322                        vreinterpret_u32_u64(vget_high_u64(q3u64)));
    323     d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
    324                        vreinterpret_u32_u64(vget_high_u64(q5u64)));
    325     d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3);
    326     q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
    327     q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
    328     q11u16 = vmull_u8(d31u8, d3u8);
    329 
    330     q7s16 = vreinterpretq_s16_u16(q7u16);
    331     q8s16 = vreinterpretq_s16_u16(q8u16);
    332     q9s16 = vreinterpretq_s16_u16(q9u16);
    333     q10s16 = vreinterpretq_s16_u16(q10u16);
    334     q11s16 = vreinterpretq_s16_u16(q11u16);
    335     q12s16 = vreinterpretq_s16_u16(q12u16);
    336     q7s16 = vqaddq_s16(q7s16, q9s16);
    337     q8s16 = vqaddq_s16(q8s16, q10s16);
    338     q12s16 = vqaddq_s16(q12s16, q11s16);
    339 
    340     d29u8 = vqrshrun_n_s16(q7s16, 7);
    341     d30u8 = vqrshrun_n_s16(q8s16, 7);
    342     d31u8 = vqrshrun_n_s16(q12s16, 7);
    343 
    344     // Second pass: 4x4
    345     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    346     d0s8 = vdup_lane_s8(dtmps8, 0);
    347     d1s8 = vdup_lane_s8(dtmps8, 1);
    348     d2s8 = vdup_lane_s8(dtmps8, 2);
    349     d3s8 = vdup_lane_s8(dtmps8, 3);
    350     d4s8 = vdup_lane_s8(dtmps8, 4);
    351     d5s8 = vdup_lane_s8(dtmps8, 5);
    352     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    353     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    354     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    355     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    356     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    357     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    358 
    359     d23u8 = vext_u8(d27u8, d28u8, 4);
    360     d24u8 = vext_u8(d28u8, d29u8, 4);
    361     d25u8 = vext_u8(d29u8, d30u8, 4);
    362     d26u8 = vext_u8(d30u8, d31u8, 4);
    363 
    364     q3u16 = vmull_u8(d27u8, d0u8);
    365     q4u16 = vmull_u8(d28u8, d0u8);
    366     q5u16 = vmull_u8(d25u8, d5u8);
    367     q6u16 = vmull_u8(d26u8, d5u8);
    368 
    369     q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
    370     q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
    371     q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
    372     q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
    373 
    374     q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
    375     q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
    376     q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
    377     q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
    378 
    379     q3s16 = vreinterpretq_s16_u16(q3u16);
    380     q4s16 = vreinterpretq_s16_u16(q4u16);
    381     q5s16 = vreinterpretq_s16_u16(q5u16);
    382     q6s16 = vreinterpretq_s16_u16(q6u16);
    383 
    384     q5s16 = vqaddq_s16(q5s16, q3s16);
    385     q6s16 = vqaddq_s16(q6s16, q4s16);
    386 
    387     d3u8 = vqrshrun_n_s16(q5s16, 7);
    388     d4u8 = vqrshrun_n_s16(q6s16, 7);
    389 
    390     vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
    391     dst_ptr += dst_pitch;
    392     vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
    393     dst_ptr += dst_pitch;
    394     vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
    395     dst_ptr += dst_pitch;
    396     vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
    397     return;
    398 }
    399 
    400 void vp8_sixtap_predict8x4_neon(
    401         unsigned char *src_ptr,
    402         int src_pixels_per_line,
    403         int xoffset,
    404         int yoffset,
    405         unsigned char *dst_ptr,
    406         int dst_pitch) {
    407     unsigned char *src;
    408     uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    409     uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
    410     uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
    411     int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    412     uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    413     uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    414     int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    415     int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    416     uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
    417 
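             /*
              * Same three-way split as the 4x4 version: vertical-only filtering
              * for xoffset == 0, horizontal-only for yoffset == 0, otherwise a
              * horizontal pass over 9 source rows followed by a vertical pass,
              * with the intermediate rows kept entirely in registers.
              */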
    418     if (xoffset == 0) {  // secondpass_filter8x4_only
    419         // load second_pass filter
    420         dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    421         d0s8 = vdup_lane_s8(dtmps8, 0);
    422         d1s8 = vdup_lane_s8(dtmps8, 1);
    423         d2s8 = vdup_lane_s8(dtmps8, 2);
    424         d3s8 = vdup_lane_s8(dtmps8, 3);
    425         d4s8 = vdup_lane_s8(dtmps8, 4);
    426         d5s8 = vdup_lane_s8(dtmps8, 5);
    427         d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    428         d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    429         d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    430         d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    431         d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    432         d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    433 
    434         // load src data
    435         src = src_ptr - src_pixels_per_line * 2;
    436         d22u8 = vld1_u8(src);
    437         src += src_pixels_per_line;
    438         d23u8 = vld1_u8(src);
    439         src += src_pixels_per_line;
    440         d24u8 = vld1_u8(src);
    441         src += src_pixels_per_line;
    442         d25u8 = vld1_u8(src);
    443         src += src_pixels_per_line;
    444         d26u8 = vld1_u8(src);
    445         src += src_pixels_per_line;
    446         d27u8 = vld1_u8(src);
    447         src += src_pixels_per_line;
    448         d28u8 = vld1_u8(src);
    449         src += src_pixels_per_line;
    450         d29u8 = vld1_u8(src);
    451         src += src_pixels_per_line;
    452         d30u8 = vld1_u8(src);
    453 
    454         q3u16 = vmull_u8(d22u8, d0u8);
    455         q4u16 = vmull_u8(d23u8, d0u8);
    456         q5u16 = vmull_u8(d24u8, d0u8);
    457         q6u16 = vmull_u8(d25u8, d0u8);
    458 
    459         q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    460         q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    461         q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    462         q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
    463 
    464         q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    465         q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    466         q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    467         q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
    468 
    469         q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    470         q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    471         q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    472         q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
    473 
    474         q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    475         q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    476         q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    477         q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
    478 
    479         q7u16 = vmull_u8(d25u8, d3u8);
    480         q8u16 = vmull_u8(d26u8, d3u8);
    481         q9u16 = vmull_u8(d27u8, d3u8);
    482         q10u16 = vmull_u8(d28u8, d3u8);
    483 
    484         q3s16 = vreinterpretq_s16_u16(q3u16);
    485         q4s16 = vreinterpretq_s16_u16(q4u16);
    486         q5s16 = vreinterpretq_s16_u16(q5u16);
    487         q6s16 = vreinterpretq_s16_u16(q6u16);
    488         q7s16 = vreinterpretq_s16_u16(q7u16);
    489         q8s16 = vreinterpretq_s16_u16(q8u16);
    490         q9s16 = vreinterpretq_s16_u16(q9u16);
    491         q10s16 = vreinterpretq_s16_u16(q10u16);
    492 
    493         q7s16 = vqaddq_s16(q7s16, q3s16);
    494         q8s16 = vqaddq_s16(q8s16, q4s16);
    495         q9s16 = vqaddq_s16(q9s16, q5s16);
    496         q10s16 = vqaddq_s16(q10s16, q6s16);
    497 
    498         d6u8 = vqrshrun_n_s16(q7s16, 7);
    499         d7u8 = vqrshrun_n_s16(q8s16, 7);
    500         d8u8 = vqrshrun_n_s16(q9s16, 7);
    501         d9u8 = vqrshrun_n_s16(q10s16, 7);
    502 
    503         vst1_u8(dst_ptr, d6u8);
    504         dst_ptr += dst_pitch;
    505         vst1_u8(dst_ptr, d7u8);
    506         dst_ptr += dst_pitch;
    507         vst1_u8(dst_ptr, d8u8);
    508         dst_ptr += dst_pitch;
    509         vst1_u8(dst_ptr, d9u8);
    510         return;
    511     }
    512 
    513     // load first_pass filter
    514     dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    515     d0s8 = vdup_lane_s8(dtmps8, 0);
    516     d1s8 = vdup_lane_s8(dtmps8, 1);
    517     d2s8 = vdup_lane_s8(dtmps8, 2);
    518     d3s8 = vdup_lane_s8(dtmps8, 3);
    519     d4s8 = vdup_lane_s8(dtmps8, 4);
    520     d5s8 = vdup_lane_s8(dtmps8, 5);
    521     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    522     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    523     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    524     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    525     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    526     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    527 
     528     // First pass: output_height lines x output_width columns (9x8)
     529     if (yoffset == 0)  // firstpass_filter8x4_only
    530         src = src_ptr - 2;
    531     else
    532         src = src_ptr - 2 - (src_pixels_per_line * 2);
    533     q3u8 = vld1q_u8(src);
    534     src += src_pixels_per_line;
    535     q4u8 = vld1q_u8(src);
    536     src += src_pixels_per_line;
    537     q5u8 = vld1q_u8(src);
    538     src += src_pixels_per_line;
    539     q6u8 = vld1q_u8(src);
    540 
    541     q7u16  = vmull_u8(vget_low_u8(q3u8), d0u8);
    542     q8u16  = vmull_u8(vget_low_u8(q4u8), d0u8);
    543     q9u16  = vmull_u8(vget_low_u8(q5u8), d0u8);
    544     q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    545 
    546     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    547     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    548     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    549     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    550 
    551     q7u16  = vmlsl_u8(q7u16, d28u8, d1u8);
    552     q8u16  = vmlsl_u8(q8u16, d29u8, d1u8);
    553     q9u16  = vmlsl_u8(q9u16, d30u8, d1u8);
    554     q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
    555 
    556     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    557     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    558     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    559     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    560 
    561     q7u16  = vmlsl_u8(q7u16, d28u8, d4u8);
    562     q8u16  = vmlsl_u8(q8u16, d29u8, d4u8);
    563     q9u16  = vmlsl_u8(q9u16, d30u8, d4u8);
    564     q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
    565 
    566     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    567     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    568     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    569     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    570 
    571     q7u16  = vmlal_u8(q7u16, d28u8, d2u8);
    572     q8u16  = vmlal_u8(q8u16, d29u8, d2u8);
    573     q9u16  = vmlal_u8(q9u16, d30u8, d2u8);
    574     q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
    575 
    576     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    577     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    578     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    579     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    580 
    581     q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
    582     q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
    583     q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
    584     q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
    585 
    586     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    587     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    588     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    589     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    590 
    591     q3u16 = vmull_u8(d28u8, d3u8);
    592     q4u16 = vmull_u8(d29u8, d3u8);
    593     q5u16 = vmull_u8(d30u8, d3u8);
    594     q6u16 = vmull_u8(d31u8, d3u8);
    595 
    596     q3s16 = vreinterpretq_s16_u16(q3u16);
    597     q4s16 = vreinterpretq_s16_u16(q4u16);
    598     q5s16 = vreinterpretq_s16_u16(q5u16);
    599     q6s16 = vreinterpretq_s16_u16(q6u16);
    600     q7s16 = vreinterpretq_s16_u16(q7u16);
    601     q8s16 = vreinterpretq_s16_u16(q8u16);
    602     q9s16 = vreinterpretq_s16_u16(q9u16);
    603     q10s16 = vreinterpretq_s16_u16(q10u16);
    604 
    605     q7s16 = vqaddq_s16(q7s16, q3s16);
    606     q8s16 = vqaddq_s16(q8s16, q4s16);
    607     q9s16 = vqaddq_s16(q9s16, q5s16);
    608     q10s16 = vqaddq_s16(q10s16, q6s16);
    609 
    610     d22u8 = vqrshrun_n_s16(q7s16, 7);
    611     d23u8 = vqrshrun_n_s16(q8s16, 7);
    612     d24u8 = vqrshrun_n_s16(q9s16, 7);
    613     d25u8 = vqrshrun_n_s16(q10s16, 7);
    614 
    615     if (yoffset == 0) {  // firstpass_filter8x4_only
    616         vst1_u8(dst_ptr, d22u8);
    617         dst_ptr += dst_pitch;
    618         vst1_u8(dst_ptr, d23u8);
    619         dst_ptr += dst_pitch;
    620         vst1_u8(dst_ptr, d24u8);
    621         dst_ptr += dst_pitch;
    622         vst1_u8(dst_ptr, d25u8);
    623         return;
    624     }
    625 
     626     // First pass on the remaining 5 rows of data
    627     src += src_pixels_per_line;
    628     q3u8 = vld1q_u8(src);
    629     src += src_pixels_per_line;
    630     q4u8 = vld1q_u8(src);
    631     src += src_pixels_per_line;
    632     q5u8 = vld1q_u8(src);
    633     src += src_pixels_per_line;
    634     q6u8 = vld1q_u8(src);
    635     src += src_pixels_per_line;
    636     q7u8 = vld1q_u8(src);
    637 
    638     q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    639     q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    640     q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    641     q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    642     q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
    643 
    644     d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    645     d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    646     d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    647     d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    648     d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
    649 
    650     q8u16  = vmlsl_u8(q8u16, d27u8, d1u8);
    651     q9u16  = vmlsl_u8(q9u16, d28u8, d1u8);
    652     q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    653     q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
    654     q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
    655 
    656     d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    657     d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    658     d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    659     d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    660     d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
    661 
    662     q8u16  = vmlsl_u8(q8u16, d27u8, d4u8);
    663     q9u16  = vmlsl_u8(q9u16, d28u8, d4u8);
    664     q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    665     q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
    666     q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
    667 
    668     d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    669     d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    670     d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    671     d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    672     d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
    673 
    674     q8u16  = vmlal_u8(q8u16, d27u8, d2u8);
    675     q9u16  = vmlal_u8(q9u16, d28u8, d2u8);
    676     q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    677     q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
    678     q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
    679 
    680     d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    681     d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    682     d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    683     d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    684     d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
    685 
    686     q8u16  = vmlal_u8(q8u16, d27u8, d5u8);
    687     q9u16  = vmlal_u8(q9u16, d28u8, d5u8);
    688     q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    689     q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
    690     q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
    691 
    692     d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    693     d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    694     d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    695     d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    696     d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
    697 
    698     q3u16 = vmull_u8(d27u8, d3u8);
    699     q4u16 = vmull_u8(d28u8, d3u8);
    700     q5u16 = vmull_u8(d29u8, d3u8);
    701     q6u16 = vmull_u8(d30u8, d3u8);
    702     q7u16 = vmull_u8(d31u8, d3u8);
    703 
    704     q3s16 = vreinterpretq_s16_u16(q3u16);
    705     q4s16 = vreinterpretq_s16_u16(q4u16);
    706     q5s16 = vreinterpretq_s16_u16(q5u16);
    707     q6s16 = vreinterpretq_s16_u16(q6u16);
    708     q7s16 = vreinterpretq_s16_u16(q7u16);
    709     q8s16 = vreinterpretq_s16_u16(q8u16);
    710     q9s16 = vreinterpretq_s16_u16(q9u16);
    711     q10s16 = vreinterpretq_s16_u16(q10u16);
    712     q11s16 = vreinterpretq_s16_u16(q11u16);
    713     q12s16 = vreinterpretq_s16_u16(q12u16);
    714 
    715     q8s16 = vqaddq_s16(q8s16, q3s16);
    716     q9s16 = vqaddq_s16(q9s16, q4s16);
    717     q10s16 = vqaddq_s16(q10s16, q5s16);
    718     q11s16 = vqaddq_s16(q11s16, q6s16);
    719     q12s16 = vqaddq_s16(q12s16, q7s16);
    720 
    721     d26u8 = vqrshrun_n_s16(q8s16, 7);
    722     d27u8 = vqrshrun_n_s16(q9s16, 7);
    723     d28u8 = vqrshrun_n_s16(q10s16, 7);
    724     d29u8 = vqrshrun_n_s16(q11s16, 7);
    725     d30u8 = vqrshrun_n_s16(q12s16, 7);
    726 
    727     // Second pass: 8x4
    728     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    729     d0s8 = vdup_lane_s8(dtmps8, 0);
    730     d1s8 = vdup_lane_s8(dtmps8, 1);
    731     d2s8 = vdup_lane_s8(dtmps8, 2);
    732     d3s8 = vdup_lane_s8(dtmps8, 3);
    733     d4s8 = vdup_lane_s8(dtmps8, 4);
    734     d5s8 = vdup_lane_s8(dtmps8, 5);
    735     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    736     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    737     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    738     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    739     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    740     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    741 
    742     q3u16 = vmull_u8(d22u8, d0u8);
    743     q4u16 = vmull_u8(d23u8, d0u8);
    744     q5u16 = vmull_u8(d24u8, d0u8);
    745     q6u16 = vmull_u8(d25u8, d0u8);
    746 
    747     q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    748     q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    749     q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    750     q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
    751 
    752     q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    753     q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    754     q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    755     q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
    756 
    757     q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    758     q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    759     q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    760     q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
    761 
    762     q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    763     q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    764     q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    765     q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
    766 
    767     q7u16 = vmull_u8(d25u8, d3u8);
    768     q8u16 = vmull_u8(d26u8, d3u8);
    769     q9u16 = vmull_u8(d27u8, d3u8);
    770     q10u16 = vmull_u8(d28u8, d3u8);
    771 
    772     q3s16 = vreinterpretq_s16_u16(q3u16);
    773     q4s16 = vreinterpretq_s16_u16(q4u16);
    774     q5s16 = vreinterpretq_s16_u16(q5u16);
    775     q6s16 = vreinterpretq_s16_u16(q6u16);
    776     q7s16 = vreinterpretq_s16_u16(q7u16);
    777     q8s16 = vreinterpretq_s16_u16(q8u16);
    778     q9s16 = vreinterpretq_s16_u16(q9u16);
    779     q10s16 = vreinterpretq_s16_u16(q10u16);
    780 
    781     q7s16 = vqaddq_s16(q7s16, q3s16);
    782     q8s16 = vqaddq_s16(q8s16, q4s16);
    783     q9s16 = vqaddq_s16(q9s16, q5s16);
    784     q10s16 = vqaddq_s16(q10s16, q6s16);
    785 
    786     d6u8 = vqrshrun_n_s16(q7s16, 7);
    787     d7u8 = vqrshrun_n_s16(q8s16, 7);
    788     d8u8 = vqrshrun_n_s16(q9s16, 7);
    789     d9u8 = vqrshrun_n_s16(q10s16, 7);
    790 
    791     vst1_u8(dst_ptr, d6u8);
    792     dst_ptr += dst_pitch;
    793     vst1_u8(dst_ptr, d7u8);
    794     dst_ptr += dst_pitch;
    795     vst1_u8(dst_ptr, d8u8);
    796     dst_ptr += dst_pitch;
    797     vst1_u8(dst_ptr, d9u8);
    798     return;
    799 }
    800 
    801 void vp8_sixtap_predict8x8_neon(
    802         unsigned char *src_ptr,
    803         int src_pixels_per_line,
    804         int xoffset,
    805         int yoffset,
    806         unsigned char *dst_ptr,
    807         int dst_pitch) {
    808     unsigned char *src, *tmpp;
    809     unsigned char tmp[64];
    810     int i;
    811     uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    812     uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
    813     uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
    814     int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    815     uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    816     uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    817     int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    818     int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    819     uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
    820 
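             /*
              * In the general two-pass case 8 + 5 = 13 source rows are filtered
              * horizontally: the first 8 intermediate rows are spilled to the
              * 64-byte tmp buffer and the last 5 stay in d26-d30, then the
              * vertical pass emits the output four rows per loop iteration.
              */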
    821     if (xoffset == 0) {  // secondpass_filter8x8_only
    822         // load second_pass filter
    823         dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    824         d0s8 = vdup_lane_s8(dtmps8, 0);
    825         d1s8 = vdup_lane_s8(dtmps8, 1);
    826         d2s8 = vdup_lane_s8(dtmps8, 2);
    827         d3s8 = vdup_lane_s8(dtmps8, 3);
    828         d4s8 = vdup_lane_s8(dtmps8, 4);
    829         d5s8 = vdup_lane_s8(dtmps8, 5);
    830         d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    831         d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    832         d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    833         d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    834         d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    835         d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    836 
    837         // load src data
    838         src = src_ptr - src_pixels_per_line * 2;
    839         d18u8 = vld1_u8(src);
    840         src += src_pixels_per_line;
    841         d19u8 = vld1_u8(src);
    842         src += src_pixels_per_line;
    843         d20u8 = vld1_u8(src);
    844         src += src_pixels_per_line;
    845         d21u8 = vld1_u8(src);
    846         src += src_pixels_per_line;
    847         d22u8 = vld1_u8(src);
    848         src += src_pixels_per_line;
    849         d23u8 = vld1_u8(src);
    850         src += src_pixels_per_line;
    851         d24u8 = vld1_u8(src);
    852         src += src_pixels_per_line;
    853         d25u8 = vld1_u8(src);
    854         src += src_pixels_per_line;
    855         d26u8 = vld1_u8(src);
    856         src += src_pixels_per_line;
    857         d27u8 = vld1_u8(src);
    858         src += src_pixels_per_line;
    859         d28u8 = vld1_u8(src);
    860         src += src_pixels_per_line;
    861         d29u8 = vld1_u8(src);
    862         src += src_pixels_per_line;
    863         d30u8 = vld1_u8(src);
    864 
    865         for (i = 2; i > 0; i--) {
    866             q3u16 = vmull_u8(d18u8, d0u8);
    867             q4u16 = vmull_u8(d19u8, d0u8);
    868             q5u16 = vmull_u8(d20u8, d0u8);
    869             q6u16 = vmull_u8(d21u8, d0u8);
    870 
    871             q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
    872             q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
    873             q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
    874             q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
    875 
    876             q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
    877             q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
    878             q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
    879             q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
    880 
    881             q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
    882             q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
    883             q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
    884             q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
    885 
    886             q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
    887             q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
    888             q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
    889             q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
    890 
    891             q7u16 = vmull_u8(d21u8, d3u8);
    892             q8u16 = vmull_u8(d22u8, d3u8);
    893             q9u16 = vmull_u8(d23u8, d3u8);
    894             q10u16 = vmull_u8(d24u8, d3u8);
    895 
    896             q3s16 = vreinterpretq_s16_u16(q3u16);
    897             q4s16 = vreinterpretq_s16_u16(q4u16);
    898             q5s16 = vreinterpretq_s16_u16(q5u16);
    899             q6s16 = vreinterpretq_s16_u16(q6u16);
    900             q7s16 = vreinterpretq_s16_u16(q7u16);
    901             q8s16 = vreinterpretq_s16_u16(q8u16);
    902             q9s16 = vreinterpretq_s16_u16(q9u16);
    903             q10s16 = vreinterpretq_s16_u16(q10u16);
    904 
    905             q7s16 = vqaddq_s16(q7s16, q3s16);
    906             q8s16 = vqaddq_s16(q8s16, q4s16);
    907             q9s16 = vqaddq_s16(q9s16, q5s16);
    908             q10s16 = vqaddq_s16(q10s16, q6s16);
    909 
    910             d6u8 = vqrshrun_n_s16(q7s16, 7);
    911             d7u8 = vqrshrun_n_s16(q8s16, 7);
    912             d8u8 = vqrshrun_n_s16(q9s16, 7);
    913             d9u8 = vqrshrun_n_s16(q10s16, 7);
    914 
    915             d18u8 = d22u8;
    916             d19u8 = d23u8;
    917             d20u8 = d24u8;
    918             d21u8 = d25u8;
    919             d22u8 = d26u8;
    920             d23u8 = d27u8;
    921             d24u8 = d28u8;
    922             d25u8 = d29u8;
    923             d26u8 = d30u8;
    924 
    925             vst1_u8(dst_ptr, d6u8);
    926             dst_ptr += dst_pitch;
    927             vst1_u8(dst_ptr, d7u8);
    928             dst_ptr += dst_pitch;
    929             vst1_u8(dst_ptr, d8u8);
    930             dst_ptr += dst_pitch;
    931             vst1_u8(dst_ptr, d9u8);
    932             dst_ptr += dst_pitch;
    933         }
    934         return;
    935     }
    936 
    937     // load first_pass filter
    938     dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    939     d0s8 = vdup_lane_s8(dtmps8, 0);
    940     d1s8 = vdup_lane_s8(dtmps8, 1);
    941     d2s8 = vdup_lane_s8(dtmps8, 2);
    942     d3s8 = vdup_lane_s8(dtmps8, 3);
    943     d4s8 = vdup_lane_s8(dtmps8, 4);
    944     d5s8 = vdup_lane_s8(dtmps8, 5);
    945     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    946     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    947     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    948     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    949     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    950     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    951 
     952     // First pass: output_height lines x output_width columns (13x8)
     953     if (yoffset == 0)  // firstpass_filter8x8_only
    954         src = src_ptr - 2;
    955     else
    956         src = src_ptr - 2 - (src_pixels_per_line * 2);
    957 
    958     tmpp = tmp;
    959     for (i = 2; i > 0; i--) {
    960         q3u8 = vld1q_u8(src);
    961         src += src_pixels_per_line;
    962         q4u8 = vld1q_u8(src);
    963         src += src_pixels_per_line;
    964         q5u8 = vld1q_u8(src);
    965         src += src_pixels_per_line;
    966         q6u8 = vld1q_u8(src);
    967         src += src_pixels_per_line;
    968 
    969         __builtin_prefetch(src);
    970         __builtin_prefetch(src + src_pixels_per_line);
    971         __builtin_prefetch(src + src_pixels_per_line * 2);
    972 
    973         q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    974         q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    975         q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    976         q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    977 
    978         d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    979         d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    980         d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    981         d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    982 
    983         q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
    984         q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
    985         q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
    986         q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
    987 
    988         d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    989         d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    990         d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    991         d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    992 
    993         q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
    994         q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
    995         q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
    996         q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
    997 
    998         d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    999         d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
   1000         d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
   1001         d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
   1002 
   1003         q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
   1004         q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
   1005         q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
   1006         q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
   1007 
   1008         d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
   1009         d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
   1010         d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
   1011         d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
   1012 
   1013         q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
   1014         q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
   1015         q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
   1016         q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
   1017 
   1018         d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
   1019         d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
   1020         d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
   1021         d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
   1022 
   1023         q3u16 = vmull_u8(d28u8, d3u8);
   1024         q4u16 = vmull_u8(d29u8, d3u8);
   1025         q5u16 = vmull_u8(d30u8, d3u8);
   1026         q6u16 = vmull_u8(d31u8, d3u8);
   1027 
   1028         q3s16 = vreinterpretq_s16_u16(q3u16);
   1029         q4s16 = vreinterpretq_s16_u16(q4u16);
   1030         q5s16 = vreinterpretq_s16_u16(q5u16);
   1031         q6s16 = vreinterpretq_s16_u16(q6u16);
   1032         q7s16 = vreinterpretq_s16_u16(q7u16);
   1033         q8s16 = vreinterpretq_s16_u16(q8u16);
   1034         q9s16 = vreinterpretq_s16_u16(q9u16);
   1035         q10s16 = vreinterpretq_s16_u16(q10u16);
   1036 
   1037         q7s16 = vqaddq_s16(q7s16, q3s16);
   1038         q8s16 = vqaddq_s16(q8s16, q4s16);
   1039         q9s16 = vqaddq_s16(q9s16, q5s16);
   1040         q10s16 = vqaddq_s16(q10s16, q6s16);
   1041 
   1042         d22u8 = vqrshrun_n_s16(q7s16, 7);
   1043         d23u8 = vqrshrun_n_s16(q8s16, 7);
   1044         d24u8 = vqrshrun_n_s16(q9s16, 7);
   1045         d25u8 = vqrshrun_n_s16(q10s16, 7);
   1046 
    1047         if (yoffset == 0) {  // firstpass_filter8x8_only (4 output rows per iteration)
   1048             vst1_u8(dst_ptr, d22u8);
   1049             dst_ptr += dst_pitch;
   1050             vst1_u8(dst_ptr, d23u8);
   1051             dst_ptr += dst_pitch;
   1052             vst1_u8(dst_ptr, d24u8);
   1053             dst_ptr += dst_pitch;
   1054             vst1_u8(dst_ptr, d25u8);
   1055             dst_ptr += dst_pitch;
   1056         } else {
   1057             vst1_u8(tmpp, d22u8);
   1058             tmpp += 8;
   1059             vst1_u8(tmpp, d23u8);
   1060             tmpp += 8;
   1061             vst1_u8(tmpp, d24u8);
   1062             tmpp += 8;
   1063             vst1_u8(tmpp, d25u8);
   1064             tmpp += 8;
   1065         }
   1066     }
   1067     if (yoffset == 0)
   1068         return;
   1069 
    1070     // First pass on the remaining 5 rows of data
   1071     q3u8 = vld1q_u8(src);
   1072     src += src_pixels_per_line;
   1073     q4u8 = vld1q_u8(src);
   1074     src += src_pixels_per_line;
   1075     q5u8 = vld1q_u8(src);
   1076     src += src_pixels_per_line;
   1077     q6u8 = vld1q_u8(src);
   1078     src += src_pixels_per_line;
   1079     q7u8 = vld1q_u8(src);
   1080 
   1081     q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
   1082     q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
   1083     q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
   1084     q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
   1085     q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
   1086 
   1087     d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
   1088     d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
   1089     d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
   1090     d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
   1091     d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
   1092 
   1093     q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
   1094     q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
   1095     q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
   1096     q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
   1097     q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
   1098 
   1099     d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
   1100     d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
   1101     d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
   1102     d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
   1103     d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
   1104 
   1105     q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
   1106     q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
   1107     q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
   1108     q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
   1109     q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
   1110 
   1111     d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
   1112     d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
   1113     d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
   1114     d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
   1115     d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
   1116 
   1117     q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
   1118     q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
   1119     q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
   1120     q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
   1121     q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
   1122 
   1123     d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
   1124     d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
   1125     d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
   1126     d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
   1127     d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
   1128 
   1129     q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
   1130     q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
   1131     q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
   1132     q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
   1133     q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
   1134 
   1135     d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
   1136     d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
   1137     d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
   1138     d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
   1139     d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
   1140 
   1141     q3u16 = vmull_u8(d27u8, d3u8);
   1142     q4u16 = vmull_u8(d28u8, d3u8);
   1143     q5u16 = vmull_u8(d29u8, d3u8);
   1144     q6u16 = vmull_u8(d30u8, d3u8);
   1145     q7u16 = vmull_u8(d31u8, d3u8);
   1146 
   1147     q3s16 = vreinterpretq_s16_u16(q3u16);
   1148     q4s16 = vreinterpretq_s16_u16(q4u16);
   1149     q5s16 = vreinterpretq_s16_u16(q5u16);
   1150     q6s16 = vreinterpretq_s16_u16(q6u16);
   1151     q7s16 = vreinterpretq_s16_u16(q7u16);
   1152     q8s16 = vreinterpretq_s16_u16(q8u16);
   1153     q9s16 = vreinterpretq_s16_u16(q9u16);
   1154     q10s16 = vreinterpretq_s16_u16(q10u16);
   1155     q11s16 = vreinterpretq_s16_u16(q11u16);
   1156     q12s16 = vreinterpretq_s16_u16(q12u16);
   1157 
   1158     q8s16 = vqaddq_s16(q8s16, q3s16);
   1159     q9s16 = vqaddq_s16(q9s16, q4s16);
   1160     q10s16 = vqaddq_s16(q10s16, q5s16);
   1161     q11s16 = vqaddq_s16(q11s16, q6s16);
   1162     q12s16 = vqaddq_s16(q12s16, q7s16);
   1163 
   1164     d26u8 = vqrshrun_n_s16(q8s16, 7);
   1165     d27u8 = vqrshrun_n_s16(q9s16, 7);
   1166     d28u8 = vqrshrun_n_s16(q10s16, 7);
   1167     d29u8 = vqrshrun_n_s16(q11s16, 7);
   1168     d30u8 = vqrshrun_n_s16(q12s16, 7);
   1169 
   1170     // Second pass: 8x8
   1171     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
   1172     d0s8 = vdup_lane_s8(dtmps8, 0);
   1173     d1s8 = vdup_lane_s8(dtmps8, 1);
   1174     d2s8 = vdup_lane_s8(dtmps8, 2);
   1175     d3s8 = vdup_lane_s8(dtmps8, 3);
   1176     d4s8 = vdup_lane_s8(dtmps8, 4);
   1177     d5s8 = vdup_lane_s8(dtmps8, 5);
   1178     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
   1179     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
   1180     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
   1181     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
   1182     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
   1183     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
   1184 
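             // Reload the first eight intermediate rows from tmp into d18-d25; the
             // last five first-pass rows are still live in d26-d30 from above.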
   1185     tmpp = tmp;
   1186     q9u8 = vld1q_u8(tmpp);
   1187     tmpp += 16;
   1188     q10u8 = vld1q_u8(tmpp);
   1189     tmpp += 16;
   1190     q11u8 = vld1q_u8(tmpp);
   1191     tmpp += 16;
   1192     q12u8 = vld1q_u8(tmpp);
   1193 
   1194     d18u8 = vget_low_u8(q9u8);
   1195     d19u8 = vget_high_u8(q9u8);
   1196     d20u8 = vget_low_u8(q10u8);
   1197     d21u8 = vget_high_u8(q10u8);
   1198     d22u8 = vget_low_u8(q11u8);
   1199     d23u8 = vget_high_u8(q11u8);
   1200     d24u8 = vget_low_u8(q12u8);
   1201     d25u8 = vget_high_u8(q12u8);
   1202 
   1203     for (i = 2; i > 0; i--) {
   1204         q3u16 = vmull_u8(d18u8, d0u8);
   1205         q4u16 = vmull_u8(d19u8, d0u8);
   1206         q5u16 = vmull_u8(d20u8, d0u8);
   1207         q6u16 = vmull_u8(d21u8, d0u8);
   1208 
   1209         q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
   1210         q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
   1211         q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
   1212         q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
   1213 
   1214         q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
   1215         q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
   1216         q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
   1217         q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
   1218 
   1219         q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
   1220         q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
   1221         q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
   1222         q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
   1223 
   1224         q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
   1225         q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
   1226         q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
   1227         q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
   1228 
   1229         q7u16 = vmull_u8(d21u8, d3u8);
   1230         q8u16 = vmull_u8(d22u8, d3u8);
   1231         q9u16 = vmull_u8(d23u8, d3u8);
   1232         q10u16 = vmull_u8(d24u8, d3u8);
   1233 
   1234         q3s16 = vreinterpretq_s16_u16(q3u16);
   1235         q4s16 = vreinterpretq_s16_u16(q4u16);
   1236         q5s16 = vreinterpretq_s16_u16(q5u16);
   1237         q6s16 = vreinterpretq_s16_u16(q6u16);
   1238         q7s16 = vreinterpretq_s16_u16(q7u16);
   1239         q8s16 = vreinterpretq_s16_u16(q8u16);
   1240         q9s16 = vreinterpretq_s16_u16(q9u16);
   1241         q10s16 = vreinterpretq_s16_u16(q10u16);
   1242 
   1243         q7s16 = vqaddq_s16(q7s16, q3s16);
   1244         q8s16 = vqaddq_s16(q8s16, q4s16);
   1245         q9s16 = vqaddq_s16(q9s16, q5s16);
   1246         q10s16 = vqaddq_s16(q10s16, q6s16);
   1247 
   1248         d6u8 = vqrshrun_n_s16(q7s16, 7);
   1249         d7u8 = vqrshrun_n_s16(q8s16, 7);
   1250         d8u8 = vqrshrun_n_s16(q9s16, 7);
   1251         d9u8 = vqrshrun_n_s16(q10s16, 7);
   1252 
   1253         d18u8 = d22u8;
   1254         d19u8 = d23u8;
   1255         d20u8 = d24u8;
   1256         d21u8 = d25u8;
   1257         d22u8 = d26u8;
   1258         d23u8 = d27u8;
   1259         d24u8 = d28u8;
   1260         d25u8 = d29u8;
   1261         d26u8 = d30u8;
   1262 
   1263         vst1_u8(dst_ptr, d6u8);
   1264         dst_ptr += dst_pitch;
   1265         vst1_u8(dst_ptr, d7u8);
   1266         dst_ptr += dst_pitch;
   1267         vst1_u8(dst_ptr, d8u8);
   1268         dst_ptr += dst_pitch;
   1269         vst1_u8(dst_ptr, d9u8);
   1270         dst_ptr += dst_pitch;
   1271     }
   1272     return;
   1273 }
   1274 
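         /* vp8_sixtap_predict16x16_neon: xoffset == 0 and yoffset == 0 fall
          * back to a single vertical or horizontal pass; the general case
          * filters horizontally into tmp (21 rows x 16 bytes = 336) and then
          * vertically from tmp into dst.  For reference, a scalar sketch of
          * one output sample (illustrative only; "p" points two samples
          * before the output position along the filtered axis and "filter" is
          * the selected row of vp8_sub_pel_filters):
          *
          *     int sum = filter[0] * p[0] + filter[1] * p[1] + filter[2] * p[2]
          *             + filter[3] * p[3] + filter[4] * p[4] + filter[5] * p[5];
          *     out = clamp((sum + 64) >> 7, 0, 255);  // vqrshrun_n_s16(sum, 7)
          */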
   1275 void vp8_sixtap_predict16x16_neon(
   1276         unsigned char *src_ptr,
   1277         int src_pixels_per_line,
   1278         int xoffset,
   1279         int yoffset,
   1280         unsigned char *dst_ptr,
   1281         int dst_pitch) {
   1282     unsigned char *src, *src_tmp, *dst, *tmpp;
   1283     unsigned char tmp[336];
   1284     int i, j;
   1285     uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
   1286     uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
   1287     uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
   1288     uint8x8_t d28u8, d29u8, d30u8, d31u8;
   1289     int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
   1290     uint8x16_t q3u8, q4u8;
   1291     uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
   1292     uint16x8_t q11u16, q12u16, q13u16, q15u16;
   1293     int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
   1294     int16x8_t q11s16, q12s16, q13s16, q15s16;
   1295 
    1296     if (xoffset == 0) {  // secondpass_filter16x16_only
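                 // No horizontal interpolation: the source rows are filtered
                 // vertically straight into dst.  The 16-wide block is handled
                 // as two independent 8-wide halves (outer loop), 16 output
                 // rows each.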
   1297         // load second_pass filter
   1298         dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
   1299         d0s8 = vdup_lane_s8(dtmps8, 0);
   1300         d1s8 = vdup_lane_s8(dtmps8, 1);
   1301         d2s8 = vdup_lane_s8(dtmps8, 2);
   1302         d3s8 = vdup_lane_s8(dtmps8, 3);
   1303         d4s8 = vdup_lane_s8(dtmps8, 4);
   1304         d5s8 = vdup_lane_s8(dtmps8, 5);
   1305         d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
   1306         d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
   1307         d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
   1308         d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
   1309         d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
   1310         d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
   1311 
   1312         // load src data
   1313         src_tmp = src_ptr - src_pixels_per_line * 2;
   1314         for (i = 0; i < 2; i++) {
   1315             src = src_tmp + i * 8;
   1316             dst = dst_ptr + i * 8;
   1317             d18u8 = vld1_u8(src);
   1318             src += src_pixels_per_line;
   1319             d19u8 = vld1_u8(src);
   1320             src += src_pixels_per_line;
   1321             d20u8 = vld1_u8(src);
   1322             src += src_pixels_per_line;
   1323             d21u8 = vld1_u8(src);
   1324             src += src_pixels_per_line;
   1325             d22u8 = vld1_u8(src);
   1326             src += src_pixels_per_line;
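                     // d18u8-d22u8 now hold the first five source rows; each
                     // pass of the loop below loads four more (d23u8-d26u8),
                     // filters the resulting nine-row window into four output
                     // rows, and then shifts the window down by four.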
   1327             for (j = 0; j < 4; j++) {
   1328                 d23u8 = vld1_u8(src);
   1329                 src += src_pixels_per_line;
   1330                 d24u8 = vld1_u8(src);
   1331                 src += src_pixels_per_line;
   1332                 d25u8 = vld1_u8(src);
   1333                 src += src_pixels_per_line;
   1334                 d26u8 = vld1_u8(src);
   1335                 src += src_pixels_per_line;
   1336 
   1337                 q3u16 = vmull_u8(d18u8, d0u8);
   1338                 q4u16 = vmull_u8(d19u8, d0u8);
   1339                 q5u16 = vmull_u8(d20u8, d0u8);
   1340                 q6u16 = vmull_u8(d21u8, d0u8);
   1341 
   1342                 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
   1343                 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
   1344                 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
   1345                 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
   1346 
   1347                 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
   1348                 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
   1349                 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
   1350                 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
   1351 
   1352                 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
   1353                 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
   1354                 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
   1355                 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
   1356 
   1357                 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
   1358                 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
   1359                 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
   1360                 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
   1361 
   1362                 q7u16 = vmull_u8(d21u8, d3u8);
   1363                 q8u16 = vmull_u8(d22u8, d3u8);
   1364                 q9u16 = vmull_u8(d23u8, d3u8);
   1365                 q10u16 = vmull_u8(d24u8, d3u8);
   1366 
   1367                 q3s16 = vreinterpretq_s16_u16(q3u16);
   1368                 q4s16 = vreinterpretq_s16_u16(q4u16);
   1369                 q5s16 = vreinterpretq_s16_u16(q5u16);
   1370                 q6s16 = vreinterpretq_s16_u16(q6u16);
   1371                 q7s16 = vreinterpretq_s16_u16(q7u16);
   1372                 q8s16 = vreinterpretq_s16_u16(q8u16);
   1373                 q9s16 = vreinterpretq_s16_u16(q9u16);
   1374                 q10s16 = vreinterpretq_s16_u16(q10u16);
   1375 
   1376                 q7s16 = vqaddq_s16(q7s16, q3s16);
   1377                 q8s16 = vqaddq_s16(q8s16, q4s16);
   1378                 q9s16 = vqaddq_s16(q9s16, q5s16);
   1379                 q10s16 = vqaddq_s16(q10s16, q6s16);
   1380 
   1381                 d6u8 = vqrshrun_n_s16(q7s16, 7);
   1382                 d7u8 = vqrshrun_n_s16(q8s16, 7);
   1383                 d8u8 = vqrshrun_n_s16(q9s16, 7);
   1384                 d9u8 = vqrshrun_n_s16(q10s16, 7);
   1385 
   1386                 d18u8 = d22u8;
   1387                 d19u8 = d23u8;
   1388                 d20u8 = d24u8;
   1389                 d21u8 = d25u8;
   1390                 d22u8 = d26u8;
   1391 
   1392                 vst1_u8(dst, d6u8);
   1393                 dst += dst_pitch;
   1394                 vst1_u8(dst, d7u8);
   1395                 dst += dst_pitch;
   1396                 vst1_u8(dst, d8u8);
   1397                 dst += dst_pitch;
   1398                 vst1_u8(dst, d9u8);
   1399                 dst += dst_pitch;
   1400             }
   1401         }
   1402         return;
   1403     }
   1404 
   1405     // load first_pass filter
   1406     dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
   1407     d0s8 = vdup_lane_s8(dtmps8, 0);
   1408     d1s8 = vdup_lane_s8(dtmps8, 1);
   1409     d2s8 = vdup_lane_s8(dtmps8, 2);
   1410     d3s8 = vdup_lane_s8(dtmps8, 3);
   1411     d4s8 = vdup_lane_s8(dtmps8, 4);
   1412     d5s8 = vdup_lane_s8(dtmps8, 5);
   1413     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
   1414     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
   1415     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
   1416     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
   1417     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
   1418     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
   1419 
    1420     // First pass: output_height lines x output_width columns (16x16 if yoffset == 0, otherwise 21x16 into tmp)
    1421     if (yoffset == 0) {  // firstpass_filter16x16_only
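                 // Horizontal-only filtering written directly to dst, two
                 // 16-wide rows per iteration.  Each row loads 24 source bytes
                 // so vext_u8 can build the shifted inputs for taps 1 to 5.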
   1422         src = src_ptr - 2;
   1423         dst = dst_ptr;
   1424         for (i = 0; i < 8; i++) {
   1425             d6u8 = vld1_u8(src);
   1426             d7u8 = vld1_u8(src + 8);
   1427             d8u8 = vld1_u8(src + 16);
   1428             src += src_pixels_per_line;
   1429             d9u8 = vld1_u8(src);
   1430             d10u8 = vld1_u8(src + 8);
   1431             d11u8 = vld1_u8(src + 16);
   1432             src += src_pixels_per_line;
   1433 
   1434             __builtin_prefetch(src);
   1435             __builtin_prefetch(src + src_pixels_per_line);
   1436 
   1437             q6u16 = vmull_u8(d6u8, d0u8);
   1438             q7u16 = vmull_u8(d7u8, d0u8);
   1439             q8u16 = vmull_u8(d9u8, d0u8);
   1440             q9u16 = vmull_u8(d10u8, d0u8);
   1441 
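                     // vext_u8(lo, hi, n) is the same row advanced by n
                     // pixels: offsets 1 and 4 feed the negative taps via
                     // vmlsl, offset 5 the last positive tap; offsets 2 and 3
                     // (and the remaining offset-5 halves) follow in the next
                     // group.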
   1442             d20u8 = vext_u8(d6u8, d7u8, 1);
   1443             d21u8 = vext_u8(d9u8, d10u8, 1);
   1444             d22u8 = vext_u8(d7u8, d8u8, 1);
   1445             d23u8 = vext_u8(d10u8, d11u8, 1);
   1446             d24u8 = vext_u8(d6u8, d7u8, 4);
   1447             d25u8 = vext_u8(d9u8, d10u8, 4);
   1448             d26u8 = vext_u8(d7u8, d8u8, 4);
   1449             d27u8 = vext_u8(d10u8, d11u8, 4);
   1450             d28u8 = vext_u8(d6u8, d7u8, 5);
   1451             d29u8 = vext_u8(d9u8, d10u8, 5);
   1452 
   1453             q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
   1454             q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
   1455             q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
   1456             q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
   1457             q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
   1458             q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
   1459             q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
   1460             q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
   1461             q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
   1462             q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
   1463 
   1464             d20u8 = vext_u8(d7u8, d8u8, 5);
   1465             d21u8 = vext_u8(d10u8, d11u8, 5);
   1466             d22u8 = vext_u8(d6u8, d7u8, 2);
   1467             d23u8 = vext_u8(d9u8, d10u8, 2);
   1468             d24u8 = vext_u8(d7u8, d8u8, 2);
   1469             d25u8 = vext_u8(d10u8, d11u8, 2);
   1470             d26u8 = vext_u8(d6u8, d7u8, 3);
   1471             d27u8 = vext_u8(d9u8, d10u8, 3);
   1472             d28u8 = vext_u8(d7u8, d8u8, 3);
   1473             d29u8 = vext_u8(d10u8, d11u8, 3);
   1474 
   1475             q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
   1476             q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
   1477             q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
   1478             q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
   1479             q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
   1480             q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
   1481 
   1482             q10u16 = vmull_u8(d26u8, d3u8);
   1483             q11u16 = vmull_u8(d27u8, d3u8);
   1484             q12u16 = vmull_u8(d28u8, d3u8);
   1485             q15u16 = vmull_u8(d29u8, d3u8);
   1486 
   1487             q6s16 = vreinterpretq_s16_u16(q6u16);
   1488             q7s16 = vreinterpretq_s16_u16(q7u16);
   1489             q8s16 = vreinterpretq_s16_u16(q8u16);
   1490             q9s16 = vreinterpretq_s16_u16(q9u16);
   1491             q10s16 = vreinterpretq_s16_u16(q10u16);
   1492             q11s16 = vreinterpretq_s16_u16(q11u16);
   1493             q12s16 = vreinterpretq_s16_u16(q12u16);
   1494             q15s16 = vreinterpretq_s16_u16(q15u16);
   1495 
   1496             q6s16 = vqaddq_s16(q6s16, q10s16);
   1497             q8s16 = vqaddq_s16(q8s16, q11s16);
   1498             q7s16 = vqaddq_s16(q7s16, q12s16);
   1499             q9s16 = vqaddq_s16(q9s16, q15s16);
   1500 
   1501             d6u8 = vqrshrun_n_s16(q6s16, 7);
   1502             d7u8 = vqrshrun_n_s16(q7s16, 7);
   1503             d8u8 = vqrshrun_n_s16(q8s16, 7);
   1504             d9u8 = vqrshrun_n_s16(q9s16, 7);
   1505 
   1506             q3u8 = vcombine_u8(d6u8, d7u8);
   1507             q4u8 = vcombine_u8(d8u8, d9u8);
   1508             vst1q_u8(dst, q3u8);
   1509             dst += dst_pitch;
   1510             vst1q_u8(dst, q4u8);
   1511             dst += dst_pitch;
   1512         }
   1513         return;
   1514     }
   1515 
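             // General case: horizontal pass into tmp first.  Sixteen output
             // rows need five extra rows of vertical support, so 21 rows of
             // 16 filtered pixels are written to tmp, three rows per
             // iteration of the loop below; the vertical pass further down
             // reads them back.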
   1516     src = src_ptr - 2 - src_pixels_per_line * 2;
   1517     tmpp = tmp;
   1518     for (i = 0; i < 7; i++) {
   1519         d6u8 = vld1_u8(src);
   1520         d7u8 = vld1_u8(src + 8);
   1521         d8u8 = vld1_u8(src + 16);
   1522         src += src_pixels_per_line;
   1523         d9u8 = vld1_u8(src);
   1524         d10u8 = vld1_u8(src + 8);
   1525         d11u8 = vld1_u8(src + 16);
   1526         src += src_pixels_per_line;
   1527         d12u8 = vld1_u8(src);
   1528         d13u8 = vld1_u8(src + 8);
   1529         d14u8 = vld1_u8(src + 16);
   1530         src += src_pixels_per_line;
   1531 
   1532         __builtin_prefetch(src);
   1533         __builtin_prefetch(src + src_pixels_per_line);
   1534         __builtin_prefetch(src + src_pixels_per_line * 2);
   1535 
   1536         q8u16 = vmull_u8(d6u8, d0u8);
   1537         q9u16 = vmull_u8(d7u8, d0u8);
   1538         q10u16 = vmull_u8(d9u8, d0u8);
   1539         q11u16 = vmull_u8(d10u8, d0u8);
   1540         q12u16 = vmull_u8(d12u8, d0u8);
   1541         q13u16 = vmull_u8(d13u8, d0u8);
   1542 
   1543         d28u8 = vext_u8(d6u8, d7u8, 1);
   1544         d29u8 = vext_u8(d9u8, d10u8, 1);
   1545         d30u8 = vext_u8(d12u8, d13u8, 1);
   1546         q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
   1547         q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
   1548         q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
   1549         d28u8 = vext_u8(d7u8, d8u8, 1);
   1550         d29u8 = vext_u8(d10u8, d11u8, 1);
   1551         d30u8 = vext_u8(d13u8, d14u8, 1);
   1552         q9u16  = vmlsl_u8(q9u16, d28u8, d1u8);
   1553         q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
   1554         q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
   1555 
   1556         d28u8 = vext_u8(d6u8, d7u8, 4);
   1557         d29u8 = vext_u8(d9u8, d10u8, 4);
   1558         d30u8 = vext_u8(d12u8, d13u8, 4);
   1559         q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
   1560         q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
   1561         q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
   1562         d28u8 = vext_u8(d7u8, d8u8, 4);
   1563         d29u8 = vext_u8(d10u8, d11u8, 4);
   1564         d30u8 = vext_u8(d13u8, d14u8, 4);
   1565         q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
   1566         q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
   1567         q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
   1568 
   1569         d28u8 = vext_u8(d6u8, d7u8, 5);
   1570         d29u8 = vext_u8(d9u8, d10u8, 5);
   1571         d30u8 = vext_u8(d12u8, d13u8, 5);
   1572         q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
   1573         q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
   1574         q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
   1575         d28u8 = vext_u8(d7u8, d8u8, 5);
   1576         d29u8 = vext_u8(d10u8, d11u8, 5);
   1577         d30u8 = vext_u8(d13u8, d14u8, 5);
   1578         q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
   1579         q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
   1580         q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
   1581 
   1582         d28u8 = vext_u8(d6u8, d7u8, 2);
   1583         d29u8 = vext_u8(d9u8, d10u8, 2);
   1584         d30u8 = vext_u8(d12u8, d13u8, 2);
   1585         q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
   1586         q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
   1587         q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
   1588         d28u8 = vext_u8(d7u8, d8u8, 2);
   1589         d29u8 = vext_u8(d10u8, d11u8, 2);
   1590         d30u8 = vext_u8(d13u8, d14u8, 2);
   1591         q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
   1592         q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
   1593         q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
   1594 
   1595         d28u8 = vext_u8(d6u8, d7u8, 3);
   1596         d29u8 = vext_u8(d9u8, d10u8, 3);
   1597         d30u8 = vext_u8(d12u8, d13u8, 3);
   1598         d15u8 = vext_u8(d7u8, d8u8, 3);
   1599         d31u8 = vext_u8(d10u8, d11u8, 3);
   1600         d6u8  = vext_u8(d13u8, d14u8, 3);
   1601         q4u16 = vmull_u8(d28u8, d3u8);
   1602         q5u16 = vmull_u8(d29u8, d3u8);
   1603         q6u16 = vmull_u8(d30u8, d3u8);
   1604         q4s16 = vreinterpretq_s16_u16(q4u16);
   1605         q5s16 = vreinterpretq_s16_u16(q5u16);
   1606         q6s16 = vreinterpretq_s16_u16(q6u16);
   1607         q8s16 = vreinterpretq_s16_u16(q8u16);
   1608         q10s16 = vreinterpretq_s16_u16(q10u16);
   1609         q12s16 = vreinterpretq_s16_u16(q12u16);
   1610         q8s16 = vqaddq_s16(q8s16, q4s16);
   1611         q10s16 = vqaddq_s16(q10s16, q5s16);
   1612         q12s16 = vqaddq_s16(q12s16, q6s16);
   1613 
   1614         q6u16 = vmull_u8(d15u8, d3u8);
   1615         q7u16 = vmull_u8(d31u8, d3u8);
   1616         q3u16 = vmull_u8(d6u8, d3u8);
   1617         q3s16 = vreinterpretq_s16_u16(q3u16);
   1618         q6s16 = vreinterpretq_s16_u16(q6u16);
   1619         q7s16 = vreinterpretq_s16_u16(q7u16);
   1620         q9s16 = vreinterpretq_s16_u16(q9u16);
   1621         q11s16 = vreinterpretq_s16_u16(q11u16);
   1622         q13s16 = vreinterpretq_s16_u16(q13u16);
   1623         q9s16 = vqaddq_s16(q9s16, q6s16);
   1624         q11s16 = vqaddq_s16(q11s16, q7s16);
   1625         q13s16 = vqaddq_s16(q13s16, q3s16);
   1626 
   1627         d6u8 = vqrshrun_n_s16(q8s16, 7);
   1628         d7u8 = vqrshrun_n_s16(q9s16, 7);
   1629         d8u8 = vqrshrun_n_s16(q10s16, 7);
   1630         d9u8 = vqrshrun_n_s16(q11s16, 7);
   1631         d10u8 = vqrshrun_n_s16(q12s16, 7);
   1632         d11u8 = vqrshrun_n_s16(q13s16, 7);
   1633 
   1634         vst1_u8(tmpp, d6u8);
   1635         tmpp += 8;
   1636         vst1_u8(tmpp, d7u8);
   1637         tmpp += 8;
   1638         vst1_u8(tmpp, d8u8);
   1639         tmpp += 8;
   1640         vst1_u8(tmpp, d9u8);
   1641         tmpp += 8;
   1642         vst1_u8(tmpp, d10u8);
   1643         tmpp += 8;
   1644         vst1_u8(tmpp, d11u8);
   1645         tmpp += 8;
   1646     }
   1647 
   1648     // Second pass: 16x16
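             // Vertical pass over tmp: the 16-wide block is filtered as two
             // 8-wide columns (outer loop) using a 16-byte stride through
             // tmp; five rows prime the window and each inner iteration emits
             // four output rows, exactly as in the xoffset == 0 path.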
   1649     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
   1650     d0s8 = vdup_lane_s8(dtmps8, 0);
   1651     d1s8 = vdup_lane_s8(dtmps8, 1);
   1652     d2s8 = vdup_lane_s8(dtmps8, 2);
   1653     d3s8 = vdup_lane_s8(dtmps8, 3);
   1654     d4s8 = vdup_lane_s8(dtmps8, 4);
   1655     d5s8 = vdup_lane_s8(dtmps8, 5);
   1656     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
   1657     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
   1658     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
   1659     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
   1660     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
   1661     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
   1662 
   1663     for (i = 0; i < 2; i++) {
   1664         dst = dst_ptr + 8 * i;
   1665         tmpp = tmp + 8 * i;
   1666         d18u8 = vld1_u8(tmpp);
   1667         tmpp += 16;
   1668         d19u8 = vld1_u8(tmpp);
   1669         tmpp += 16;
   1670         d20u8 = vld1_u8(tmpp);
   1671         tmpp += 16;
   1672         d21u8 = vld1_u8(tmpp);
   1673         tmpp += 16;
   1674         d22u8 = vld1_u8(tmpp);
   1675         tmpp += 16;
   1676         for (j = 0; j < 4; j++) {
   1677             d23u8 = vld1_u8(tmpp);
   1678             tmpp += 16;
   1679             d24u8 = vld1_u8(tmpp);
   1680             tmpp += 16;
   1681             d25u8 = vld1_u8(tmpp);
   1682             tmpp += 16;
   1683             d26u8 = vld1_u8(tmpp);
   1684             tmpp += 16;
   1685 
   1686             q3u16 = vmull_u8(d18u8, d0u8);
   1687             q4u16 = vmull_u8(d19u8, d0u8);
   1688             q5u16 = vmull_u8(d20u8, d0u8);
   1689             q6u16 = vmull_u8(d21u8, d0u8);
   1690 
   1691             q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
   1692             q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
   1693             q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
   1694             q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
   1695 
   1696             q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
   1697             q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
   1698             q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
   1699             q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
   1700 
   1701             q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
   1702             q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
   1703             q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
   1704             q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
   1705 
   1706             q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
   1707             q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
   1708             q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
   1709             q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
   1710 
   1711             q7u16 = vmull_u8(d21u8, d3u8);
   1712             q8u16 = vmull_u8(d22u8, d3u8);
   1713             q9u16 = vmull_u8(d23u8, d3u8);
   1714             q10u16 = vmull_u8(d24u8, d3u8);
   1715 
   1716             q3s16 = vreinterpretq_s16_u16(q3u16);
   1717             q4s16 = vreinterpretq_s16_u16(q4u16);
   1718             q5s16 = vreinterpretq_s16_u16(q5u16);
   1719             q6s16 = vreinterpretq_s16_u16(q6u16);
   1720             q7s16 = vreinterpretq_s16_u16(q7u16);
   1721             q8s16 = vreinterpretq_s16_u16(q8u16);
   1722             q9s16 = vreinterpretq_s16_u16(q9u16);
   1723             q10s16 = vreinterpretq_s16_u16(q10u16);
   1724 
   1725             q7s16 = vqaddq_s16(q7s16, q3s16);
   1726             q8s16 = vqaddq_s16(q8s16, q4s16);
   1727             q9s16 = vqaddq_s16(q9s16, q5s16);
   1728             q10s16 = vqaddq_s16(q10s16, q6s16);
   1729 
   1730             d6u8 = vqrshrun_n_s16(q7s16, 7);
   1731             d7u8 = vqrshrun_n_s16(q8s16, 7);
   1732             d8u8 = vqrshrun_n_s16(q9s16, 7);
   1733             d9u8 = vqrshrun_n_s16(q10s16, 7);
   1734 
   1735             d18u8 = d22u8;
   1736             d19u8 = d23u8;
   1737             d20u8 = d24u8;
   1738             d21u8 = d25u8;
   1739             d22u8 = d26u8;
   1740 
   1741             vst1_u8(dst, d6u8);
   1742             dst += dst_pitch;
   1743             vst1_u8(dst, d7u8);
   1744             dst += dst_pitch;
   1745             vst1_u8(dst, d8u8);
   1746             dst += dst_pitch;
   1747             vst1_u8(dst, d9u8);
   1748             dst += dst_pitch;
   1749         }
   1750     }
   1751     return;
   1752 }
   1753