      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 #include <string.h>
     13 #include "./vpx_config.h"
     14 #include "./vp8_rtcd.h"
     15 #include "vpx_dsp/arm/mem_neon.h"
     16 #include "vpx_ports/mem.h"
     17 
     18 static const int8_t vp8_sub_pel_filters[8][8] = {
      19   { 0, 0, 128, 0, 0, 0, 0, 0 },     /* note that 1/8 pel positions are */
     20   { 0, -6, 123, 12, -1, 0, 0, 0 },  /*    just as per alpha -0.5 bicubic */
     21   { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
     22   { 0, -9, 93, 50, -6, 0, 0, 0 },
     23   { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
     24   { 0, -6, 50, 93, -9, 0, 0, 0 },
     25   { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
     26   { 0, -1, 12, 123, -6, 0, 0, 0 },
     27 };
     28 
     29 // This table is derived from vp8/common/filter.c:vp8_sub_pel_filters.
     30 // Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive.
     31 // Elements 1 and 4 are either 0 or negative. The code accounts for this with
     32 // multiply/accumulates which either add or subtract as needed. The other
     33 // functions will be updated to use this table later.
     34 // It is also expanded to 8 elements to allow loading into 64 bit neon
     35 // registers.
     36 static const uint8_t abs_filters[8][8] = {
     37   { 0, 0, 128, 0, 0, 0, 0, 0 },   { 0, 6, 123, 12, 1, 0, 0, 0 },
     38   { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
     39   { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
     40   { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
     41 };
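
// For reference, a scalar sketch of the per-pixel math the NEON code performs
// with these absolute-value taps: taps 1 and 4 are subtracted (they are the
// negative coefficients in vp8_sub_pel_filters above), the sum is rounded and
// shifted by 7 (VP8_FILTER_SHIFT), and the result is clamped to [0, 255].
// This is only an illustrative sketch, kept out of the build, and the function
// name is made up here:
#if 0
static unsigned char scalar_sixtap_pixel(const unsigned char *src,
                                         const uint8_t f[6] /* abs_filters row */) {
  int sum = f[0] * src[-2] - f[1] * src[-1] + f[2] * src[0] + f[3] * src[1] -
            f[4] * src[2] + f[5] * src[3];
  sum = (sum + 64) >> 7;     // Round to nearest, as vqrshrun_n_s16(x, 7) does.
  if (sum < 0) sum = 0;      // Negative sums clamp to 0 ...
  if (sum > 255) sum = 255;  // ... and large sums saturate to 255.
  return (unsigned char)sum;
}
#endif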
     42 
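// load_and_shift() reads 8 bytes and, on a little-endian target, moves the
// first 4 source pixels into lanes 4-7 of the result (lanes 0-3 become zero):
//   vld1_u8:         p0 p1 p2 p3 p4 p5 p6 p7
//   load_and_shift:   0  0  0  0 p0 p1 p2 p3
// A following vext_u8(even, odd, 4) can then pair those 4 pixels with the
// first 4 pixels of the next row.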
     43 static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
     44   return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
     45 }
     46 
     47 static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
     48                                          const uint8x8_t filter, uint16x8_t *c,
     49                                          uint16x8_t *d) {
     50   const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
     51                                        vreinterpret_u32_u8(vget_high_u8(a)));
     52   const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
     53                                        vreinterpret_u32_u8(vget_high_u8(b)));
     54   *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
     55   *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
     56 }
     57 
     58 static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
     59                                          const uint8x8_t filter, uint16x8_t *c,
     60                                          uint16x8_t *d) {
     61   const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
     62                                        vreinterpret_u32_u8(vget_high_u8(a)));
     63   const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
     64                                        vreinterpret_u32_u8(vget_high_u8(b)));
     65   *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
     66   *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
     67 }
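
// Both helpers above work on the low 4 bytes of each 8-byte half of their
// 128-bit inputs: vzip_u32(...).val[0] gathers those two 32-bit groups into a
// single 64-bit vector, so one vmlal_u8/vmlsl_u8 applies the same filter tap
// to 4 output pixels from each of the two rows packed into 'a' and 'b'.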
     68 
     69 static INLINE void yonly4x4(const unsigned char *src, int src_stride,
     70                             int filter_offset, unsigned char *dst,
     71                             int dst_stride) {
     72   uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
     73   uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
     74   uint16x8_t c0, c1, c2, c3;
     75   int16x8_t d0, d1;
     76   uint8x8_t e0, e1;
     77 
     78   const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
     79   const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
     80   const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
     81   const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
     82   const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
     83   const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
     84   const uint8x8_t filter5 = vdup_lane_u8(filter, 5);
     85 
     86   src -= src_stride * 2;
     87   // Shift the even rows to allow using 'vext' to combine the vectors. armv8
     88   // has vcopy_lane which would be interesting. This started as just a
     89   // horrible workaround for clang adding alignment hints to 32bit loads:
     90   // https://llvm.org/bugs/show_bug.cgi?id=24421
      91   // But it turns out it is almost identical to casting the loads.
     92   a0 = load_and_shift(src);
     93   src += src_stride;
     94   a1 = vld1_u8(src);
     95   src += src_stride;
     96   a2 = load_and_shift(src);
     97   src += src_stride;
     98   a3 = vld1_u8(src);
     99   src += src_stride;
    100   a4 = load_and_shift(src);
    101   src += src_stride;
    102   a5 = vld1_u8(src);
    103   src += src_stride;
    104   a6 = load_and_shift(src);
    105   src += src_stride;
    106   a7 = vld1_u8(src);
    107   src += src_stride;
    108   a8 = vld1_u8(src);
    109 
    110   // Combine the rows so we can operate on 8 at a time.
    111   b0 = vext_u8(a0, a1, 4);
    112   b2 = vext_u8(a2, a3, 4);
    113   b4 = vext_u8(a4, a5, 4);
    114   b6 = vext_u8(a6, a7, 4);
    115   b8 = a8;
    116 
    117   // To keep with the 8-at-a-time theme, combine *alternate* rows. This
    118   // allows combining the odd rows with the even.
    119   b1 = vext_u8(b0, b2, 4);
    120   b3 = vext_u8(b2, b4, 4);
    121   b5 = vext_u8(b4, b6, 4);
    122   b7 = vext_u8(b6, b8, 4);
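
  // At this point bN holds the first 4 pixels of source rows N and N+1, so
  // each 8-lane multiply below filters two output rows at once: c0/c2
  // accumulate output rows 0-1 and c1/c3 accumulate rows 2-3.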
    123 
    124   // Multiply and expand to 16 bits.
    125   c0 = vmull_u8(b0, filter0);
    126   c1 = vmull_u8(b2, filter0);
    127   c2 = vmull_u8(b5, filter5);
    128   c3 = vmull_u8(b7, filter5);
    129 
    130   // Multiply, subtract and accumulate for filters 1 and 4 (the negative
    131   // ones).
    132   c0 = vmlsl_u8(c0, b4, filter4);
    133   c1 = vmlsl_u8(c1, b6, filter4);
    134   c2 = vmlsl_u8(c2, b1, filter1);
    135   c3 = vmlsl_u8(c3, b3, filter1);
    136 
    137   // Add more positive ones. vmlal should really return a signed type.
    138   // It's doing signed math internally, as evidenced by the fact we can do
    139   // subtractions followed by more additions. Ideally we could use
     140   // vqmlal/vqmlsl but those instructions don't exist. Might be able to
    141   // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
    142   c0 = vmlal_u8(c0, b2, filter2);
    143   c1 = vmlal_u8(c1, b4, filter2);
    144   c2 = vmlal_u8(c2, b3, filter3);
    145   c3 = vmlal_u8(c3, b5, filter3);
    146 
    147   // Use signed saturation math because vmlsl may have left some negative
    148   // numbers in there.
    149   d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
    150   d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
    151 
    152   // Use signed again because numbers like -200 need to be saturated to 0.
    153   e0 = vqrshrun_n_s16(d0, 7);
    154   e1 = vqrshrun_n_s16(d1, 7);
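
  // As a quick sanity check on the arithmetic: every row of the filter table
  // sums to 128 (e.g. the half-pel taps give 3 - 16 + 77 + 77 - 16 + 3 = 128),
  // so on a flat patch of value 128 each combined accumulator holds
  // 128 * 128 = 16384 and (16384 + 64) >> 7 = 128, i.e. flat regions pass
  // through unchanged.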
    155 
    156   store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
    157 }
    158 
    159 void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
    160                                 int xoffset, int yoffset,
    161                                 unsigned char *dst_ptr, int dst_pitch) {
    162   uint8x16_t s0, s1, s2, s3, s4;
    163   uint64x2_t s01, s23;
    164   // Variables to hold src[] elements for the given filter[]
    165   uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
    166   uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
    167   uint8x16_t s01_f0, s23_f0;
    168   uint64x2_t s01_f3, s23_f3;
    169   uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
    170   // Accumulator variables.
    171   uint16x8_t d0123, d4567, d89;
    172   uint16x8_t d0123_a, d4567_a, d89_a;
    173   int16x8_t e0123, e4567, e89;
    174   // Second pass intermediates.
    175   uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
    176   uint16x8_t c0, c1, c2, c3;
    177   int16x8_t d0, d1;
    178   uint8x8_t e0, e1;
    179   uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;
    180 
    181   if (xoffset == 0) {  // Second pass only.
    182     yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
    183     return;
    184   }
    185 
    186   if (yoffset == 0) {  // First pass only.
    187     src_ptr -= 2;
    188   } else {  // Add context for the second pass. 2 extra lines on top.
    189     src_ptr -= 2 + (src_pixels_per_line * 2);
    190   }
    191 
    192   filter = vld1_u8(abs_filters[xoffset]);
    193   filter0 = vdup_lane_u8(filter, 0);
    194   filter1 = vdup_lane_u8(filter, 1);
    195   filter2 = vdup_lane_u8(filter, 2);
    196   filter3 = vdup_lane_u8(filter, 3);
    197   filter4 = vdup_lane_u8(filter, 4);
    198   filter5 = vdup_lane_u8(filter, 5);
    199 
    200   // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
    201   // garbage. So much effort for that last single bit.
    202   // The low values of each pair are for filter0.
    203   s0 = vld1q_u8(src_ptr);
    204   src_ptr += src_pixels_per_line;
    205   s1 = vld1q_u8(src_ptr);
    206   src_ptr += src_pixels_per_line;
    207   s2 = vld1q_u8(src_ptr);
    208   src_ptr += src_pixels_per_line;
    209   s3 = vld1q_u8(src_ptr);
    210   src_ptr += src_pixels_per_line;
    211 
    212   // Shift to extract values for filter[5]
     213   // Counting the unadjusted source position as index 0, this puts:
    214   // 3 4 5 6 7 8 9 10 in s0_f5
    215   // Can't use vshr.u64 because it crosses the double word boundary.
    216   s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
    217   s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
    218   s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
    219   s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
    220 
    221   s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
    222   s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
    223 
    224   s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
    225   s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
    226   d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
    227   d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
    228 
    229   // Keep original src data as 64 bits to simplify shifting and extracting.
    230   s01 = vreinterpretq_u64_u8(s01_f0);
    231   s23 = vreinterpretq_u64_u8(s23_f0);
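
  // Each vshrq_n_u64 below drops whole bytes from the front of both rows at
  // once: after shifting right by 8 * k bits, the low 32-bit lane of each
  // 64-bit half holds src[k - 2] .. src[k + 1], which is exactly what tap k
  // of the filter needs for the four output columns.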
    232 
     233   // -2 -1 0 1 * filter0
    234   filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
    235 
    236   // Shift over one to use -1, 0, 1, 2 for filter1
    237   // -1 0 1 2 * filter1
    238   filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
    239                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
    240                         &d0123, &d4567);
    241 
    242   // 2 3 4 5 * filter4
    243   filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
    244                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
    245                         &d0123, &d4567);
    246 
    247   // 0 1 2 3 * filter2
    248   filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
    249                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
    250                         &d0123, &d4567);
    251 
    252   // 1 2 3 4 * filter3
    253   s01_f3 = vshrq_n_u64(s01, 24);
    254   s23_f3 = vshrq_n_u64(s23, 24);
    255   s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
    256                       vreinterpret_u32_u64(vget_high_u64(s01_f3)));
    257   s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
    258                       vreinterpret_u32_u64(vget_high_u64(s23_f3)));
    259   // Accumulate into different registers so it can use saturated addition.
    260   d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
    261   d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
    262 
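  // Keeping the filter3 term in its own accumulator also keeps each partial
  // sum within int16_t range before the reinterpret: the positive taps feeding
  // either register total at most 128, and 128 * 255 = 32640 < 32767, whereas
  // folding filter3 in as well could reach (77 + 77 + 3 + 3) * 255 = 40800 and
  // read back as a negative int16_t. The vqaddq_s16 below saturates the
  // combined sum, which is harmless because anything at or above 32767 still
  // narrows to 255 after the rounding shift by 7.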
    263   e0123 =
    264       vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
    265   e4567 =
    266       vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
    267 
    268   // Shift and narrow.
    269   b0 = vqrshrun_n_s16(e0123, 7);
    270   b2 = vqrshrun_n_s16(e4567, 7);
    271 
    272   if (yoffset == 0) {  // firstpass_filter4x4_only
    273     store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
    274     return;
    275   }
    276 
    277   // Load additional context when doing both filters.
    278   s0 = vld1q_u8(src_ptr);
    279   src_ptr += src_pixels_per_line;
    280   s1 = vld1q_u8(src_ptr);
    281   src_ptr += src_pixels_per_line;
    282   s2 = vld1q_u8(src_ptr);
    283   src_ptr += src_pixels_per_line;
    284   s3 = vld1q_u8(src_ptr);
    285   src_ptr += src_pixels_per_line;
    286   s4 = vld1q_u8(src_ptr);
    287 
    288   s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
    289   s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
    290   s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
    291   s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
    292   s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);
    293 
     294   // -2 -1 0 1 * filter0
    295   s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
    296   s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
    297 
    298   s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
    299   s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
     300   // But this time, instead of 16 pixels to filter, there are 20, so there is
     301   // an extra run with a doubleword register.
    302   d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
    303   d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
    304   d89 = vmull_u8(s4_f5, filter5);
    305 
    306   // Save a copy as u64 for shifting.
    307   s01 = vreinterpretq_u64_u8(s01_f0);
    308   s23 = vreinterpretq_u64_u8(s23_f0);
    309 
    310   filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
    311   d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);
    312 
    313   filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
    314                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
    315                         &d0123, &d4567);
    316   s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
    317   d89 = vmlsl_u8(d89, s4_f1, filter1);
    318 
    319   filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
    320                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
    321                         &d0123, &d4567);
    322   s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
    323   d89 = vmlsl_u8(d89, s4_f4, filter4);
    324 
    325   filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
    326                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
    327                         &d0123, &d4567);
    328   s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
    329   d89 = vmlal_u8(d89, s4_f2, filter2);
    330 
    331   s01_f3 = vshrq_n_u64(s01, 24);
    332   s23_f3 = vshrq_n_u64(s23, 24);
    333   s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
    334                       vreinterpret_u32_u64(vget_high_u64(s01_f3)));
    335   s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
    336                       vreinterpret_u32_u64(vget_high_u64(s23_f3)));
    337   s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
    338   d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
    339   d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
    340   d89_a = vmull_u8(s4_f3, filter3);
    341 
    342   e0123 =
    343       vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
    344   e4567 =
    345       vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
    346   e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));
    347 
    348   b4 = vqrshrun_n_s16(e0123, 7);
    349   b6 = vqrshrun_n_s16(e4567, 7);
    350   b8 = vqrshrun_n_s16(e89, 7);
    351 
    352   // Second pass: 4x4
    353   filter = vld1_u8(abs_filters[yoffset]);
    354   filter0 = vdup_lane_u8(filter, 0);
    355   filter1 = vdup_lane_u8(filter, 1);
    356   filter2 = vdup_lane_u8(filter, 2);
    357   filter3 = vdup_lane_u8(filter, 3);
    358   filter4 = vdup_lane_u8(filter, 4);
    359   filter5 = vdup_lane_u8(filter, 5);
    360 
    361   b1 = vext_u8(b0, b2, 4);
    362   b3 = vext_u8(b2, b4, 4);
    363   b5 = vext_u8(b4, b6, 4);
    364   b7 = vext_u8(b6, b8, 4);
    365 
    366   c0 = vmull_u8(b0, filter0);
    367   c1 = vmull_u8(b2, filter0);
    368   c2 = vmull_u8(b5, filter5);
    369   c3 = vmull_u8(b7, filter5);
    370 
    371   c0 = vmlsl_u8(c0, b4, filter4);
    372   c1 = vmlsl_u8(c1, b6, filter4);
    373   c2 = vmlsl_u8(c2, b1, filter1);
    374   c3 = vmlsl_u8(c3, b3, filter1);
    375 
    376   c0 = vmlal_u8(c0, b2, filter2);
    377   c1 = vmlal_u8(c1, b4, filter2);
    378   c2 = vmlal_u8(c2, b3, filter3);
    379   c3 = vmlal_u8(c3, b5, filter3);
    380 
    381   d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
    382   d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
    383 
    384   e0 = vqrshrun_n_s16(d0, 7);
    385   e1 = vqrshrun_n_s16(d1, 7);
    386 
    387   store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
    388 }
    389 
    390 void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
    391                                 int xoffset, int yoffset,
    392                                 unsigned char *dst_ptr, int dst_pitch) {
    393   unsigned char *src;
    394   uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    395   uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
    396   uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
    397   int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    398   uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    399   uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    400   int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    401   int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    402   uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
    403 
    404   if (xoffset == 0) {  // secondpass_filter8x4_only
    405     // load second_pass filter
    406     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    407     d0s8 = vdup_lane_s8(dtmps8, 0);
    408     d1s8 = vdup_lane_s8(dtmps8, 1);
    409     d2s8 = vdup_lane_s8(dtmps8, 2);
    410     d3s8 = vdup_lane_s8(dtmps8, 3);
    411     d4s8 = vdup_lane_s8(dtmps8, 4);
    412     d5s8 = vdup_lane_s8(dtmps8, 5);
    413     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    414     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    415     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    416     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    417     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    418     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
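
    // vabs_s8 on the signed taps gives the same values as the abs_filters[]
    // table used by the 4x4 code above; taps 1 and 4 (the negative
    // coefficients) are applied with vmlsl_u8 below, the rest with
    // vmull_u8/vmlal_u8.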
    419 
    420     // load src data
    421     src = src_ptr - src_pixels_per_line * 2;
    422     d22u8 = vld1_u8(src);
    423     src += src_pixels_per_line;
    424     d23u8 = vld1_u8(src);
    425     src += src_pixels_per_line;
    426     d24u8 = vld1_u8(src);
    427     src += src_pixels_per_line;
    428     d25u8 = vld1_u8(src);
    429     src += src_pixels_per_line;
    430     d26u8 = vld1_u8(src);
    431     src += src_pixels_per_line;
    432     d27u8 = vld1_u8(src);
    433     src += src_pixels_per_line;
    434     d28u8 = vld1_u8(src);
    435     src += src_pixels_per_line;
    436     d29u8 = vld1_u8(src);
    437     src += src_pixels_per_line;
    438     d30u8 = vld1_u8(src);
    439 
    440     q3u16 = vmull_u8(d22u8, d0u8);
    441     q4u16 = vmull_u8(d23u8, d0u8);
    442     q5u16 = vmull_u8(d24u8, d0u8);
    443     q6u16 = vmull_u8(d25u8, d0u8);
    444 
    445     q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    446     q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    447     q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    448     q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
    449 
    450     q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    451     q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    452     q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    453     q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
    454 
    455     q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    456     q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    457     q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    458     q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
    459 
    460     q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    461     q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    462     q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    463     q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
    464 
    465     q7u16 = vmull_u8(d25u8, d3u8);
    466     q8u16 = vmull_u8(d26u8, d3u8);
    467     q9u16 = vmull_u8(d27u8, d3u8);
    468     q10u16 = vmull_u8(d28u8, d3u8);
    469 
    470     q3s16 = vreinterpretq_s16_u16(q3u16);
    471     q4s16 = vreinterpretq_s16_u16(q4u16);
    472     q5s16 = vreinterpretq_s16_u16(q5u16);
    473     q6s16 = vreinterpretq_s16_u16(q6u16);
    474     q7s16 = vreinterpretq_s16_u16(q7u16);
    475     q8s16 = vreinterpretq_s16_u16(q8u16);
    476     q9s16 = vreinterpretq_s16_u16(q9u16);
    477     q10s16 = vreinterpretq_s16_u16(q10u16);
    478 
    479     q7s16 = vqaddq_s16(q7s16, q3s16);
    480     q8s16 = vqaddq_s16(q8s16, q4s16);
    481     q9s16 = vqaddq_s16(q9s16, q5s16);
    482     q10s16 = vqaddq_s16(q10s16, q6s16);
    483 
    484     d6u8 = vqrshrun_n_s16(q7s16, 7);
    485     d7u8 = vqrshrun_n_s16(q8s16, 7);
    486     d8u8 = vqrshrun_n_s16(q9s16, 7);
    487     d9u8 = vqrshrun_n_s16(q10s16, 7);
    488 
    489     vst1_u8(dst_ptr, d6u8);
    490     dst_ptr += dst_pitch;
    491     vst1_u8(dst_ptr, d7u8);
    492     dst_ptr += dst_pitch;
    493     vst1_u8(dst_ptr, d8u8);
    494     dst_ptr += dst_pitch;
    495     vst1_u8(dst_ptr, d9u8);
    496     return;
    497   }
    498 
    499   // load first_pass filter
    500   dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    501   d0s8 = vdup_lane_s8(dtmps8, 0);
    502   d1s8 = vdup_lane_s8(dtmps8, 1);
    503   d2s8 = vdup_lane_s8(dtmps8, 2);
    504   d3s8 = vdup_lane_s8(dtmps8, 3);
    505   d4s8 = vdup_lane_s8(dtmps8, 4);
    506   d5s8 = vdup_lane_s8(dtmps8, 5);
    507   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    508   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    509   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    510   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    511   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    512   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    513 
     514   // First pass: output_height lines x output_width columns (9x8)
     515   if (yoffset == 0)  // firstpass_filter8x4_only
    516     src = src_ptr - 2;
    517   else
    518     src = src_ptr - 2 - (src_pixels_per_line * 2);
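  // The "- 2" provides two columns of left context for the horizontal taps;
  // when a vertical pass follows, also start two rows above for its context.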
    519   q3u8 = vld1q_u8(src);
    520   src += src_pixels_per_line;
    521   q4u8 = vld1q_u8(src);
    522   src += src_pixels_per_line;
    523   q5u8 = vld1q_u8(src);
    524   src += src_pixels_per_line;
    525   q6u8 = vld1q_u8(src);
    526 
    527   q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    528   q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    529   q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    530   q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    531 
    532   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    533   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    534   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    535   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    536 
    537   q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
    538   q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
    539   q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
    540   q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
    541 
    542   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    543   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    544   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    545   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    546 
    547   q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
    548   q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
    549   q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
    550   q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
    551 
    552   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    553   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    554   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    555   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    556 
    557   q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
    558   q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
    559   q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
    560   q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
    561 
    562   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    563   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    564   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    565   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    566 
    567   q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
    568   q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
    569   q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
    570   q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
    571 
    572   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    573   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    574   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    575   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    576 
    577   q3u16 = vmull_u8(d28u8, d3u8);
    578   q4u16 = vmull_u8(d29u8, d3u8);
    579   q5u16 = vmull_u8(d30u8, d3u8);
    580   q6u16 = vmull_u8(d31u8, d3u8);
    581 
    582   q3s16 = vreinterpretq_s16_u16(q3u16);
    583   q4s16 = vreinterpretq_s16_u16(q4u16);
    584   q5s16 = vreinterpretq_s16_u16(q5u16);
    585   q6s16 = vreinterpretq_s16_u16(q6u16);
    586   q7s16 = vreinterpretq_s16_u16(q7u16);
    587   q8s16 = vreinterpretq_s16_u16(q8u16);
    588   q9s16 = vreinterpretq_s16_u16(q9u16);
    589   q10s16 = vreinterpretq_s16_u16(q10u16);
    590 
    591   q7s16 = vqaddq_s16(q7s16, q3s16);
    592   q8s16 = vqaddq_s16(q8s16, q4s16);
    593   q9s16 = vqaddq_s16(q9s16, q5s16);
    594   q10s16 = vqaddq_s16(q10s16, q6s16);
    595 
    596   d22u8 = vqrshrun_n_s16(q7s16, 7);
    597   d23u8 = vqrshrun_n_s16(q8s16, 7);
    598   d24u8 = vqrshrun_n_s16(q9s16, 7);
    599   d25u8 = vqrshrun_n_s16(q10s16, 7);
    600 
    601   if (yoffset == 0) {  // firstpass_filter8x4_only
    602     vst1_u8(dst_ptr, d22u8);
    603     dst_ptr += dst_pitch;
    604     vst1_u8(dst_ptr, d23u8);
    605     dst_ptr += dst_pitch;
    606     vst1_u8(dst_ptr, d24u8);
    607     dst_ptr += dst_pitch;
    608     vst1_u8(dst_ptr, d25u8);
    609     return;
    610   }
    611 
     612   // First pass on the remaining 5 lines of data
    613   src += src_pixels_per_line;
    614   q3u8 = vld1q_u8(src);
    615   src += src_pixels_per_line;
    616   q4u8 = vld1q_u8(src);
    617   src += src_pixels_per_line;
    618   q5u8 = vld1q_u8(src);
    619   src += src_pixels_per_line;
    620   q6u8 = vld1q_u8(src);
    621   src += src_pixels_per_line;
    622   q7u8 = vld1q_u8(src);
    623 
    624   q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    625   q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    626   q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    627   q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    628   q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
    629 
    630   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    631   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    632   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    633   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    634   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
    635 
    636   q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
    637   q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    638   q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    639   q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
    640   q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
    641 
    642   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    643   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    644   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    645   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    646   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
    647 
    648   q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
    649   q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    650   q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    651   q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
    652   q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
    653 
    654   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    655   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    656   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    657   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    658   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
    659 
    660   q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
    661   q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    662   q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    663   q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
    664   q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
    665 
    666   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    667   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    668   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    669   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    670   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
    671 
    672   q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
    673   q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    674   q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    675   q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
    676   q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
    677 
    678   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    679   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    680   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    681   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    682   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
    683 
    684   q3u16 = vmull_u8(d27u8, d3u8);
    685   q4u16 = vmull_u8(d28u8, d3u8);
    686   q5u16 = vmull_u8(d29u8, d3u8);
    687   q6u16 = vmull_u8(d30u8, d3u8);
    688   q7u16 = vmull_u8(d31u8, d3u8);
    689 
    690   q3s16 = vreinterpretq_s16_u16(q3u16);
    691   q4s16 = vreinterpretq_s16_u16(q4u16);
    692   q5s16 = vreinterpretq_s16_u16(q5u16);
    693   q6s16 = vreinterpretq_s16_u16(q6u16);
    694   q7s16 = vreinterpretq_s16_u16(q7u16);
    695   q8s16 = vreinterpretq_s16_u16(q8u16);
    696   q9s16 = vreinterpretq_s16_u16(q9u16);
    697   q10s16 = vreinterpretq_s16_u16(q10u16);
    698   q11s16 = vreinterpretq_s16_u16(q11u16);
    699   q12s16 = vreinterpretq_s16_u16(q12u16);
    700 
    701   q8s16 = vqaddq_s16(q8s16, q3s16);
    702   q9s16 = vqaddq_s16(q9s16, q4s16);
    703   q10s16 = vqaddq_s16(q10s16, q5s16);
    704   q11s16 = vqaddq_s16(q11s16, q6s16);
    705   q12s16 = vqaddq_s16(q12s16, q7s16);
    706 
    707   d26u8 = vqrshrun_n_s16(q8s16, 7);
    708   d27u8 = vqrshrun_n_s16(q9s16, 7);
    709   d28u8 = vqrshrun_n_s16(q10s16, 7);
    710   d29u8 = vqrshrun_n_s16(q11s16, 7);
    711   d30u8 = vqrshrun_n_s16(q12s16, 7);
    712 
    713   // Second pass: 8x4
    714   dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    715   d0s8 = vdup_lane_s8(dtmps8, 0);
    716   d1s8 = vdup_lane_s8(dtmps8, 1);
    717   d2s8 = vdup_lane_s8(dtmps8, 2);
    718   d3s8 = vdup_lane_s8(dtmps8, 3);
    719   d4s8 = vdup_lane_s8(dtmps8, 4);
    720   d5s8 = vdup_lane_s8(dtmps8, 5);
    721   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    722   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    723   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    724   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    725   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    726   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    727 
    728   q3u16 = vmull_u8(d22u8, d0u8);
    729   q4u16 = vmull_u8(d23u8, d0u8);
    730   q5u16 = vmull_u8(d24u8, d0u8);
    731   q6u16 = vmull_u8(d25u8, d0u8);
    732 
    733   q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    734   q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    735   q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    736   q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
    737 
    738   q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    739   q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    740   q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    741   q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
    742 
    743   q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    744   q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    745   q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    746   q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
    747 
    748   q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    749   q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    750   q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    751   q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
    752 
    753   q7u16 = vmull_u8(d25u8, d3u8);
    754   q8u16 = vmull_u8(d26u8, d3u8);
    755   q9u16 = vmull_u8(d27u8, d3u8);
    756   q10u16 = vmull_u8(d28u8, d3u8);
    757 
    758   q3s16 = vreinterpretq_s16_u16(q3u16);
    759   q4s16 = vreinterpretq_s16_u16(q4u16);
    760   q5s16 = vreinterpretq_s16_u16(q5u16);
    761   q6s16 = vreinterpretq_s16_u16(q6u16);
    762   q7s16 = vreinterpretq_s16_u16(q7u16);
    763   q8s16 = vreinterpretq_s16_u16(q8u16);
    764   q9s16 = vreinterpretq_s16_u16(q9u16);
    765   q10s16 = vreinterpretq_s16_u16(q10u16);
    766 
    767   q7s16 = vqaddq_s16(q7s16, q3s16);
    768   q8s16 = vqaddq_s16(q8s16, q4s16);
    769   q9s16 = vqaddq_s16(q9s16, q5s16);
    770   q10s16 = vqaddq_s16(q10s16, q6s16);
    771 
    772   d6u8 = vqrshrun_n_s16(q7s16, 7);
    773   d7u8 = vqrshrun_n_s16(q8s16, 7);
    774   d8u8 = vqrshrun_n_s16(q9s16, 7);
    775   d9u8 = vqrshrun_n_s16(q10s16, 7);
    776 
    777   vst1_u8(dst_ptr, d6u8);
    778   dst_ptr += dst_pitch;
    779   vst1_u8(dst_ptr, d7u8);
    780   dst_ptr += dst_pitch;
    781   vst1_u8(dst_ptr, d8u8);
    782   dst_ptr += dst_pitch;
    783   vst1_u8(dst_ptr, d9u8);
    784   return;
    785 }
    786 
    787 void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
    788                                 int xoffset, int yoffset,
    789                                 unsigned char *dst_ptr, int dst_pitch) {
    790   unsigned char *src, *tmpp;
    791   unsigned char tmp[64];
    792   int i;
    793   uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    794   uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
    795   uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
    796   int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    797   uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    798   uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    799   int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    800   int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    801   uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
    802 
    803   if (xoffset == 0) {  // secondpass_filter8x8_only
    804     // load second_pass filter
    805     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    806     d0s8 = vdup_lane_s8(dtmps8, 0);
    807     d1s8 = vdup_lane_s8(dtmps8, 1);
    808     d2s8 = vdup_lane_s8(dtmps8, 2);
    809     d3s8 = vdup_lane_s8(dtmps8, 3);
    810     d4s8 = vdup_lane_s8(dtmps8, 4);
    811     d5s8 = vdup_lane_s8(dtmps8, 5);
    812     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    813     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    814     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    815     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    816     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    817     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    818 
    819     // load src data
    820     src = src_ptr - src_pixels_per_line * 2;
    821     d18u8 = vld1_u8(src);
    822     src += src_pixels_per_line;
    823     d19u8 = vld1_u8(src);
    824     src += src_pixels_per_line;
    825     d20u8 = vld1_u8(src);
    826     src += src_pixels_per_line;
    827     d21u8 = vld1_u8(src);
    828     src += src_pixels_per_line;
    829     d22u8 = vld1_u8(src);
    830     src += src_pixels_per_line;
    831     d23u8 = vld1_u8(src);
    832     src += src_pixels_per_line;
    833     d24u8 = vld1_u8(src);
    834     src += src_pixels_per_line;
    835     d25u8 = vld1_u8(src);
    836     src += src_pixels_per_line;
    837     d26u8 = vld1_u8(src);
    838     src += src_pixels_per_line;
    839     d27u8 = vld1_u8(src);
    840     src += src_pixels_per_line;
    841     d28u8 = vld1_u8(src);
    842     src += src_pixels_per_line;
    843     d29u8 = vld1_u8(src);
    844     src += src_pixels_per_line;
    845     d30u8 = vld1_u8(src);
    846 
    847     for (i = 2; i > 0; i--) {
    848       q3u16 = vmull_u8(d18u8, d0u8);
    849       q4u16 = vmull_u8(d19u8, d0u8);
    850       q5u16 = vmull_u8(d20u8, d0u8);
    851       q6u16 = vmull_u8(d21u8, d0u8);
    852 
    853       q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
    854       q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
    855       q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
    856       q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
    857 
    858       q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
    859       q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
    860       q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
    861       q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
    862 
    863       q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
    864       q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
    865       q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
    866       q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
    867 
    868       q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
    869       q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
    870       q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
    871       q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
    872 
    873       q7u16 = vmull_u8(d21u8, d3u8);
    874       q8u16 = vmull_u8(d22u8, d3u8);
    875       q9u16 = vmull_u8(d23u8, d3u8);
    876       q10u16 = vmull_u8(d24u8, d3u8);
    877 
    878       q3s16 = vreinterpretq_s16_u16(q3u16);
    879       q4s16 = vreinterpretq_s16_u16(q4u16);
    880       q5s16 = vreinterpretq_s16_u16(q5u16);
    881       q6s16 = vreinterpretq_s16_u16(q6u16);
    882       q7s16 = vreinterpretq_s16_u16(q7u16);
    883       q8s16 = vreinterpretq_s16_u16(q8u16);
    884       q9s16 = vreinterpretq_s16_u16(q9u16);
    885       q10s16 = vreinterpretq_s16_u16(q10u16);
    886 
    887       q7s16 = vqaddq_s16(q7s16, q3s16);
    888       q8s16 = vqaddq_s16(q8s16, q4s16);
    889       q9s16 = vqaddq_s16(q9s16, q5s16);
    890       q10s16 = vqaddq_s16(q10s16, q6s16);
    891 
    892       d6u8 = vqrshrun_n_s16(q7s16, 7);
    893       d7u8 = vqrshrun_n_s16(q8s16, 7);
    894       d8u8 = vqrshrun_n_s16(q9s16, 7);
    895       d9u8 = vqrshrun_n_s16(q10s16, 7);
    896 
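      // Slide the 6-tap window down 4 source rows for the next 4 output rows.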
    897       d18u8 = d22u8;
    898       d19u8 = d23u8;
    899       d20u8 = d24u8;
    900       d21u8 = d25u8;
    901       d22u8 = d26u8;
    902       d23u8 = d27u8;
    903       d24u8 = d28u8;
    904       d25u8 = d29u8;
    905       d26u8 = d30u8;
    906 
    907       vst1_u8(dst_ptr, d6u8);
    908       dst_ptr += dst_pitch;
    909       vst1_u8(dst_ptr, d7u8);
    910       dst_ptr += dst_pitch;
    911       vst1_u8(dst_ptr, d8u8);
    912       dst_ptr += dst_pitch;
    913       vst1_u8(dst_ptr, d9u8);
    914       dst_ptr += dst_pitch;
    915     }
    916     return;
    917   }
    918 
    919   // load first_pass filter
    920   dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    921   d0s8 = vdup_lane_s8(dtmps8, 0);
    922   d1s8 = vdup_lane_s8(dtmps8, 1);
    923   d2s8 = vdup_lane_s8(dtmps8, 2);
    924   d3s8 = vdup_lane_s8(dtmps8, 3);
    925   d4s8 = vdup_lane_s8(dtmps8, 4);
    926   d5s8 = vdup_lane_s8(dtmps8, 5);
    927   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    928   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    929   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    930   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    931   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    932   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    933 
     934   // First pass: output_height lines x output_width columns (13x8)
     935   if (yoffset == 0)  // firstpass_filter8x8_only
    936     src = src_ptr - 2;
    937   else
    938     src = src_ptr - 2 - (src_pixels_per_line * 2);
    939 
    940   tmpp = tmp;
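  // When a second pass follows, the loop below writes 8 intermediate rows
  // (8 bytes each) into tmp; the remaining 5 intermediate rows are computed
  // after the loop and kept in d26u8..d30u8 for the second pass. With
  // yoffset == 0 the rows are stored straight to dst_ptr instead.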
    941   for (i = 2; i > 0; i--) {
    942     q3u8 = vld1q_u8(src);
    943     src += src_pixels_per_line;
    944     q4u8 = vld1q_u8(src);
    945     src += src_pixels_per_line;
    946     q5u8 = vld1q_u8(src);
    947     src += src_pixels_per_line;
    948     q6u8 = vld1q_u8(src);
    949     src += src_pixels_per_line;
    950 
    951     __builtin_prefetch(src);
    952     __builtin_prefetch(src + src_pixels_per_line);
    953     __builtin_prefetch(src + src_pixels_per_line * 2);
    954 
    955     q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    956     q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    957     q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    958     q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    959 
    960     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    961     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    962     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    963     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    964 
    965     q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
    966     q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
    967     q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
    968     q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
    969 
    970     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    971     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    972     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    973     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    974 
    975     q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
    976     q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
    977     q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
    978     q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
    979 
    980     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    981     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    982     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    983     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    984 
    985     q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
    986     q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
    987     q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
    988     q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
    989 
    990     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    991     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    992     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    993     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    994 
    995     q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
    996     q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
    997     q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
    998     q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
    999 
   1000     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
   1001     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
   1002     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
   1003     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
   1004 
   1005     q3u16 = vmull_u8(d28u8, d3u8);
   1006     q4u16 = vmull_u8(d29u8, d3u8);
   1007     q5u16 = vmull_u8(d30u8, d3u8);
   1008     q6u16 = vmull_u8(d31u8, d3u8);
   1009 
   1010     q3s16 = vreinterpretq_s16_u16(q3u16);
   1011     q4s16 = vreinterpretq_s16_u16(q4u16);
   1012     q5s16 = vreinterpretq_s16_u16(q5u16);
   1013     q6s16 = vreinterpretq_s16_u16(q6u16);
   1014     q7s16 = vreinterpretq_s16_u16(q7u16);
   1015     q8s16 = vreinterpretq_s16_u16(q8u16);
   1016     q9s16 = vreinterpretq_s16_u16(q9u16);
   1017     q10s16 = vreinterpretq_s16_u16(q10u16);
   1018 
   1019     q7s16 = vqaddq_s16(q7s16, q3s16);
   1020     q8s16 = vqaddq_s16(q8s16, q4s16);
   1021     q9s16 = vqaddq_s16(q9s16, q5s16);
   1022     q10s16 = vqaddq_s16(q10s16, q6s16);
   1023 
   1024     d22u8 = vqrshrun_n_s16(q7s16, 7);
   1025     d23u8 = vqrshrun_n_s16(q8s16, 7);
   1026     d24u8 = vqrshrun_n_s16(q9s16, 7);
   1027     d25u8 = vqrshrun_n_s16(q10s16, 7);
   1028 
    1029     if (yoffset == 0) {  // firstpass_filter8x8_only
   1030       vst1_u8(dst_ptr, d22u8);
   1031       dst_ptr += dst_pitch;
   1032       vst1_u8(dst_ptr, d23u8);
   1033       dst_ptr += dst_pitch;
   1034       vst1_u8(dst_ptr, d24u8);
   1035       dst_ptr += dst_pitch;
   1036       vst1_u8(dst_ptr, d25u8);
   1037       dst_ptr += dst_pitch;
   1038     } else {
   1039       vst1_u8(tmpp, d22u8);
   1040       tmpp += 8;
   1041       vst1_u8(tmpp, d23u8);
   1042       tmpp += 8;
   1043       vst1_u8(tmpp, d24u8);
   1044       tmpp += 8;
   1045       vst1_u8(tmpp, d25u8);
   1046       tmpp += 8;
   1047     }
   1048   }
   1049   if (yoffset == 0) return;
   1050 
    1051   // First pass on the remaining 5 lines of data
   1052   q3u8 = vld1q_u8(src);
   1053   src += src_pixels_per_line;
   1054   q4u8 = vld1q_u8(src);
   1055   src += src_pixels_per_line;
   1056   q5u8 = vld1q_u8(src);
   1057   src += src_pixels_per_line;
   1058   q6u8 = vld1q_u8(src);
   1059   src += src_pixels_per_line;
   1060   q7u8 = vld1q_u8(src);
   1061 
   1062   q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
   1063   q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
   1064   q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
   1065   q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
   1066   q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
   1067 
   1068   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
   1069   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
   1070   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
   1071   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
   1072   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
   1073 
   1074   q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
   1075   q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
   1076   q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
   1077   q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
   1078   q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
   1079 
   1080   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
   1081   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
   1082   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
   1083   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
   1084   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
   1085 
   1086   q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
   1087   q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
   1088   q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
   1089   q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
   1090   q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
   1091 
   1092   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
   1093   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
   1094   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
   1095   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
   1096   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
   1097 
   1098   q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
   1099   q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
   1100   q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
   1101   q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
   1102   q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
   1103 
   1104   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
   1105   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
   1106   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
   1107   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
   1108   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
   1109 
   1110   q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
   1111   q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
   1112   q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
   1113   q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
   1114   q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
   1115 
   1116   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
   1117   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
   1118   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
   1119   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
   1120   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
   1121 
   1122   q3u16 = vmull_u8(d27u8, d3u8);
   1123   q4u16 = vmull_u8(d28u8, d3u8);
   1124   q5u16 = vmull_u8(d29u8, d3u8);
   1125   q6u16 = vmull_u8(d30u8, d3u8);
   1126   q7u16 = vmull_u8(d31u8, d3u8);
   1127 
   1128   q3s16 = vreinterpretq_s16_u16(q3u16);
   1129   q4s16 = vreinterpretq_s16_u16(q4u16);
   1130   q5s16 = vreinterpretq_s16_u16(q5u16);
   1131   q6s16 = vreinterpretq_s16_u16(q6u16);
   1132   q7s16 = vreinterpretq_s16_u16(q7u16);
   1133   q8s16 = vreinterpretq_s16_u16(q8u16);
   1134   q9s16 = vreinterpretq_s16_u16(q9u16);
   1135   q10s16 = vreinterpretq_s16_u16(q10u16);
   1136   q11s16 = vreinterpretq_s16_u16(q11u16);
   1137   q12s16 = vreinterpretq_s16_u16(q12u16);
   1138 
   1139   q8s16 = vqaddq_s16(q8s16, q3s16);
   1140   q9s16 = vqaddq_s16(q9s16, q4s16);
   1141   q10s16 = vqaddq_s16(q10s16, q5s16);
   1142   q11s16 = vqaddq_s16(q11s16, q6s16);
   1143   q12s16 = vqaddq_s16(q12s16, q7s16);
   1144 
   1145   d26u8 = vqrshrun_n_s16(q8s16, 7);
   1146   d27u8 = vqrshrun_n_s16(q9s16, 7);
   1147   d28u8 = vqrshrun_n_s16(q10s16, 7);
   1148   d29u8 = vqrshrun_n_s16(q11s16, 7);
   1149   d30u8 = vqrshrun_n_s16(q12s16, 7);
   1150 
   1151   // Second pass: 8x8
   1152   dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
   1153   d0s8 = vdup_lane_s8(dtmps8, 0);
   1154   d1s8 = vdup_lane_s8(dtmps8, 1);
   1155   d2s8 = vdup_lane_s8(dtmps8, 2);
   1156   d3s8 = vdup_lane_s8(dtmps8, 3);
   1157   d4s8 = vdup_lane_s8(dtmps8, 4);
   1158   d5s8 = vdup_lane_s8(dtmps8, 5);
   1159   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
   1160   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
   1161   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
   1162   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
   1163   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
   1164   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
   1165 
   1166   tmpp = tmp;
   1167   q9u8 = vld1q_u8(tmpp);
   1168   tmpp += 16;
   1169   q10u8 = vld1q_u8(tmpp);
   1170   tmpp += 16;
   1171   q11u8 = vld1q_u8(tmpp);
   1172   tmpp += 16;
   1173   q12u8 = vld1q_u8(tmpp);
   1174 
   1175   d18u8 = vget_low_u8(q9u8);
   1176   d19u8 = vget_high_u8(q9u8);
   1177   d20u8 = vget_low_u8(q10u8);
   1178   d21u8 = vget_high_u8(q10u8);
   1179   d22u8 = vget_low_u8(q11u8);
   1180   d23u8 = vget_high_u8(q11u8);
   1181   d24u8 = vget_low_u8(q12u8);
   1182   d25u8 = vget_high_u8(q12u8);
   1183 
   1184   for (i = 2; i > 0; i--) {
   1185     q3u16 = vmull_u8(d18u8, d0u8);
   1186     q4u16 = vmull_u8(d19u8, d0u8);
   1187     q5u16 = vmull_u8(d20u8, d0u8);
   1188     q6u16 = vmull_u8(d21u8, d0u8);
   1189 
   1190     q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
   1191     q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
   1192     q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
   1193     q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
   1194 
   1195     q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
   1196     q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
   1197     q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
   1198     q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
   1199 
   1200     q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
   1201     q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
   1202     q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
   1203     q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
   1204 
   1205     q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
   1206     q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
   1207     q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
   1208     q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
   1209 
   1210     q7u16 = vmull_u8(d21u8, d3u8);
   1211     q8u16 = vmull_u8(d22u8, d3u8);
   1212     q9u16 = vmull_u8(d23u8, d3u8);
   1213     q10u16 = vmull_u8(d24u8, d3u8);
   1214 
   1215     q3s16 = vreinterpretq_s16_u16(q3u16);
   1216     q4s16 = vreinterpretq_s16_u16(q4u16);
   1217     q5s16 = vreinterpretq_s16_u16(q5u16);
   1218     q6s16 = vreinterpretq_s16_u16(q6u16);
   1219     q7s16 = vreinterpretq_s16_u16(q7u16);
   1220     q8s16 = vreinterpretq_s16_u16(q8u16);
   1221     q9s16 = vreinterpretq_s16_u16(q9u16);
   1222     q10s16 = vreinterpretq_s16_u16(q10u16);
   1223 
   1224     q7s16 = vqaddq_s16(q7s16, q3s16);
   1225     q8s16 = vqaddq_s16(q8s16, q4s16);
   1226     q9s16 = vqaddq_s16(q9s16, q5s16);
   1227     q10s16 = vqaddq_s16(q10s16, q6s16);
   1228 
   1229     d6u8 = vqrshrun_n_s16(q7s16, 7);
   1230     d7u8 = vqrshrun_n_s16(q8s16, 7);
   1231     d8u8 = vqrshrun_n_s16(q9s16, 7);
   1232     d9u8 = vqrshrun_n_s16(q10s16, 7);
   1233 
   1234     d18u8 = d22u8;
   1235     d19u8 = d23u8;
   1236     d20u8 = d24u8;
   1237     d21u8 = d25u8;
   1238     d22u8 = d26u8;
   1239     d23u8 = d27u8;
   1240     d24u8 = d28u8;
   1241     d25u8 = d29u8;
   1242     d26u8 = d30u8;
   1243 
   1244     vst1_u8(dst_ptr, d6u8);
   1245     dst_ptr += dst_pitch;
   1246     vst1_u8(dst_ptr, d7u8);
   1247     dst_ptr += dst_pitch;
   1248     vst1_u8(dst_ptr, d8u8);
   1249     dst_ptr += dst_pitch;
   1250     vst1_u8(dst_ptr, d9u8);
   1251     dst_ptr += dst_pitch;
   1252   }
   1253   return;
   1254 }
   1255 
   1256 void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
   1257                                   int src_pixels_per_line, int xoffset,
   1258                                   int yoffset, unsigned char *dst_ptr,
   1259                                   int dst_pitch) {
   1260   unsigned char *src, *src_tmp, *dst, *tmpp;
   1261   unsigned char tmp[336];
   1262   int i, j;
   1263   uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
   1264   uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
   1265   uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
   1266   uint8x8_t d28u8, d29u8, d30u8, d31u8;
   1267   int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
   1268   uint8x16_t q3u8, q4u8;
   1269   uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
   1270   uint16x8_t q11u16, q12u16, q13u16, q15u16;
   1271   int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
   1272   int16x8_t q11s16, q12s16, q13s16, q15s16;
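  // tmp holds the 21 (16 + 5 rows of filter context) intermediate rows of 16
  // first-pass pixels needed when both passes run: 21 * 16 = 336 bytes.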
   1273 
    1274   if (xoffset == 0) {  // secondpass_filter16x16_only
   1275     // load second_pass filter
   1276     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
   1277     d0s8 = vdup_lane_s8(dtmps8, 0);
   1278     d1s8 = vdup_lane_s8(dtmps8, 1);
   1279     d2s8 = vdup_lane_s8(dtmps8, 2);
   1280     d3s8 = vdup_lane_s8(dtmps8, 3);
   1281     d4s8 = vdup_lane_s8(dtmps8, 4);
   1282     d5s8 = vdup_lane_s8(dtmps8, 5);
   1283     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
   1284     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
   1285     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
   1286     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
   1287     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
   1288     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
   1289 
   1290     // load src data
   1291     src_tmp = src_ptr - src_pixels_per_line * 2;
   1292     for (i = 0; i < 2; ++i) {
   1293       src = src_tmp + i * 8;
   1294       dst = dst_ptr + i * 8;
   1295       d18u8 = vld1_u8(src);
   1296       src += src_pixels_per_line;
   1297       d19u8 = vld1_u8(src);
   1298       src += src_pixels_per_line;
   1299       d20u8 = vld1_u8(src);
   1300       src += src_pixels_per_line;
   1301       d21u8 = vld1_u8(src);
   1302       src += src_pixels_per_line;
   1303       d22u8 = vld1_u8(src);
   1304       src += src_pixels_per_line;
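               /* d18u8..d22u8 now hold source rows -2..+2 for this 8-wide column.
                * Each pass of the loop below loads four more rows, writes four
                * filtered rows, and slides the five-row window down by four. */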
   1305       for (j = 0; j < 4; ++j) {
   1306         d23u8 = vld1_u8(src);
   1307         src += src_pixels_per_line;
   1308         d24u8 = vld1_u8(src);
   1309         src += src_pixels_per_line;
   1310         d25u8 = vld1_u8(src);
   1311         src += src_pixels_per_line;
   1312         d26u8 = vld1_u8(src);
   1313         src += src_pixels_per_line;
   1314 
   1315         q3u16 = vmull_u8(d18u8, d0u8);
   1316         q4u16 = vmull_u8(d19u8, d0u8);
   1317         q5u16 = vmull_u8(d20u8, d0u8);
   1318         q6u16 = vmull_u8(d21u8, d0u8);
   1319 
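                 /* Accumulation pattern: taps 0, 2 and 5 are added and taps 1 and
                  * 4 subtracted in q3..q6, while the tap 3 products go to
                  * q7..q10.  The u16 partial sums may wrap "negative";
                  * reinterpreting them as s16 and combining with vqaddq_s16
                  * further down recovers the signed value and saturates any
                  * overflow before the rounding narrow to bytes. */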
   1320         q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
   1321         q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
   1322         q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
   1323         q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
   1324 
   1325         q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
   1326         q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
   1327         q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
   1328         q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
   1329 
   1330         q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
   1331         q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
   1332         q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
   1333         q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
   1334 
   1335         q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
   1336         q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
   1337         q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
   1338         q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
   1339 
   1340         q7u16 = vmull_u8(d21u8, d3u8);
   1341         q8u16 = vmull_u8(d22u8, d3u8);
   1342         q9u16 = vmull_u8(d23u8, d3u8);
   1343         q10u16 = vmull_u8(d24u8, d3u8);
   1344 
   1345         q3s16 = vreinterpretq_s16_u16(q3u16);
   1346         q4s16 = vreinterpretq_s16_u16(q4u16);
   1347         q5s16 = vreinterpretq_s16_u16(q5u16);
   1348         q6s16 = vreinterpretq_s16_u16(q6u16);
   1349         q7s16 = vreinterpretq_s16_u16(q7u16);
   1350         q8s16 = vreinterpretq_s16_u16(q8u16);
   1351         q9s16 = vreinterpretq_s16_u16(q9u16);
   1352         q10s16 = vreinterpretq_s16_u16(q10u16);
   1353 
   1354         q7s16 = vqaddq_s16(q7s16, q3s16);
   1355         q8s16 = vqaddq_s16(q8s16, q4s16);
   1356         q9s16 = vqaddq_s16(q9s16, q5s16);
   1357         q10s16 = vqaddq_s16(q10s16, q6s16);
   1358 
   1359         d6u8 = vqrshrun_n_s16(q7s16, 7);
   1360         d7u8 = vqrshrun_n_s16(q8s16, 7);
   1361         d8u8 = vqrshrun_n_s16(q9s16, 7);
   1362         d9u8 = vqrshrun_n_s16(q10s16, 7);
   1363 
   1364         d18u8 = d22u8;
   1365         d19u8 = d23u8;
   1366         d20u8 = d24u8;
   1367         d21u8 = d25u8;
   1368         d22u8 = d26u8;
   1369 
   1370         vst1_u8(dst, d6u8);
   1371         dst += dst_pitch;
   1372         vst1_u8(dst, d7u8);
   1373         dst += dst_pitch;
   1374         vst1_u8(dst, d8u8);
   1375         dst += dst_pitch;
   1376         vst1_u8(dst, d9u8);
   1377         dst += dst_pitch;
   1378       }
   1379     }
   1380     return;
   1381   }
   1382 
   1383   // load first_pass filter
   1384   dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
   1385   d0s8 = vdup_lane_s8(dtmps8, 0);
   1386   d1s8 = vdup_lane_s8(dtmps8, 1);
   1387   d2s8 = vdup_lane_s8(dtmps8, 2);
   1388   d3s8 = vdup_lane_s8(dtmps8, 3);
   1389   d4s8 = vdup_lane_s8(dtmps8, 4);
   1390   d5s8 = vdup_lane_s8(dtmps8, 5);
   1391   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
   1392   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
   1393   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
   1394   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
   1395   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
   1396   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
   1397 
    1398   // First pass: horizontal filter; 16x16 alone, or 21x16 into tmp for two-pass.
    1399   if (yoffset == 0) {  // firstpass filter only (16x16)
   1400     src = src_ptr - 2;
   1401     dst = dst_ptr;
   1402     for (i = 0; i < 8; ++i) {
   1403       d6u8 = vld1_u8(src);
   1404       d7u8 = vld1_u8(src + 8);
   1405       d8u8 = vld1_u8(src + 16);
   1406       src += src_pixels_per_line;
   1407       d9u8 = vld1_u8(src);
   1408       d10u8 = vld1_u8(src + 8);
   1409       d11u8 = vld1_u8(src + 16);
   1410       src += src_pixels_per_line;
   1411 
   1412       __builtin_prefetch(src);
   1413       __builtin_prefetch(src + src_pixels_per_line);
   1414 
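               /* Each row was loaded as three 8-byte vectors covering 24 source
                * bytes starting at x - 2; vext_u8(a, b, k) below picks out the
                * eight p[x - 2 + k] neighbours, i.e. the input for tap k of the
                * horizontal 6-tap filter on the corresponding half-row. */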
   1415       q6u16 = vmull_u8(d6u8, d0u8);
   1416       q7u16 = vmull_u8(d7u8, d0u8);
   1417       q8u16 = vmull_u8(d9u8, d0u8);
   1418       q9u16 = vmull_u8(d10u8, d0u8);
   1419 
   1420       d20u8 = vext_u8(d6u8, d7u8, 1);
   1421       d21u8 = vext_u8(d9u8, d10u8, 1);
   1422       d22u8 = vext_u8(d7u8, d8u8, 1);
   1423       d23u8 = vext_u8(d10u8, d11u8, 1);
   1424       d24u8 = vext_u8(d6u8, d7u8, 4);
   1425       d25u8 = vext_u8(d9u8, d10u8, 4);
   1426       d26u8 = vext_u8(d7u8, d8u8, 4);
   1427       d27u8 = vext_u8(d10u8, d11u8, 4);
   1428       d28u8 = vext_u8(d6u8, d7u8, 5);
   1429       d29u8 = vext_u8(d9u8, d10u8, 5);
   1430 
   1431       q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
   1432       q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
   1433       q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
   1434       q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
   1435       q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
   1436       q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
   1437       q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
   1438       q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
   1439       q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
   1440       q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
   1441 
   1442       d20u8 = vext_u8(d7u8, d8u8, 5);
   1443       d21u8 = vext_u8(d10u8, d11u8, 5);
   1444       d22u8 = vext_u8(d6u8, d7u8, 2);
   1445       d23u8 = vext_u8(d9u8, d10u8, 2);
   1446       d24u8 = vext_u8(d7u8, d8u8, 2);
   1447       d25u8 = vext_u8(d10u8, d11u8, 2);
   1448       d26u8 = vext_u8(d6u8, d7u8, 3);
   1449       d27u8 = vext_u8(d9u8, d10u8, 3);
   1450       d28u8 = vext_u8(d7u8, d8u8, 3);
   1451       d29u8 = vext_u8(d10u8, d11u8, 3);
   1452 
   1453       q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
   1454       q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
   1455       q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
   1456       q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
   1457       q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
   1458       q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
   1459 
   1460       q10u16 = vmull_u8(d26u8, d3u8);
   1461       q11u16 = vmull_u8(d27u8, d3u8);
   1462       q12u16 = vmull_u8(d28u8, d3u8);
   1463       q15u16 = vmull_u8(d29u8, d3u8);
   1464 
   1465       q6s16 = vreinterpretq_s16_u16(q6u16);
   1466       q7s16 = vreinterpretq_s16_u16(q7u16);
   1467       q8s16 = vreinterpretq_s16_u16(q8u16);
   1468       q9s16 = vreinterpretq_s16_u16(q9u16);
   1469       q10s16 = vreinterpretq_s16_u16(q10u16);
   1470       q11s16 = vreinterpretq_s16_u16(q11u16);
   1471       q12s16 = vreinterpretq_s16_u16(q12u16);
   1472       q15s16 = vreinterpretq_s16_u16(q15u16);
   1473 
   1474       q6s16 = vqaddq_s16(q6s16, q10s16);
   1475       q8s16 = vqaddq_s16(q8s16, q11s16);
   1476       q7s16 = vqaddq_s16(q7s16, q12s16);
   1477       q9s16 = vqaddq_s16(q9s16, q15s16);
   1478 
   1479       d6u8 = vqrshrun_n_s16(q6s16, 7);
   1480       d7u8 = vqrshrun_n_s16(q7s16, 7);
   1481       d8u8 = vqrshrun_n_s16(q8s16, 7);
   1482       d9u8 = vqrshrun_n_s16(q9s16, 7);
   1483 
   1484       q3u8 = vcombine_u8(d6u8, d7u8);
   1485       q4u8 = vcombine_u8(d8u8, d9u8);
   1486       vst1q_u8(dst, q3u8);
   1487       dst += dst_pitch;
   1488       vst1q_u8(dst, q4u8);
   1489       dst += dst_pitch;
   1490     }
   1491     return;
   1492   }
   1493 
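           /* Two-pass case: filter horizontally into tmp[] first.  The vertical
            * pass needs two extra rows above and three below the block, so
            * 16 + 5 = 21 rows of 16 pixels are produced (7 iterations of 3 rows),
            * filling the 21 * 16 = 336 byte buffer.  Conceptually,
            *   tmp[r * 16 + c]  ==  horizontally filtered sample (c, r - 2).   */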
   1494   src = src_ptr - 2 - src_pixels_per_line * 2;
   1495   tmpp = tmp;
   1496   for (i = 0; i < 7; ++i) {
   1497     d6u8 = vld1_u8(src);
   1498     d7u8 = vld1_u8(src + 8);
   1499     d8u8 = vld1_u8(src + 16);
   1500     src += src_pixels_per_line;
   1501     d9u8 = vld1_u8(src);
   1502     d10u8 = vld1_u8(src + 8);
   1503     d11u8 = vld1_u8(src + 16);
   1504     src += src_pixels_per_line;
   1505     d12u8 = vld1_u8(src);
   1506     d13u8 = vld1_u8(src + 8);
   1507     d14u8 = vld1_u8(src + 16);
   1508     src += src_pixels_per_line;
   1509 
   1510     __builtin_prefetch(src);
   1511     __builtin_prefetch(src + src_pixels_per_line);
   1512     __builtin_prefetch(src + src_pixels_per_line * 2);
   1513 
   1514     q8u16 = vmull_u8(d6u8, d0u8);
   1515     q9u16 = vmull_u8(d7u8, d0u8);
   1516     q10u16 = vmull_u8(d9u8, d0u8);
   1517     q11u16 = vmull_u8(d10u8, d0u8);
   1518     q12u16 = vmull_u8(d12u8, d0u8);
   1519     q13u16 = vmull_u8(d13u8, d0u8);
   1520 
   1521     d28u8 = vext_u8(d6u8, d7u8, 1);
   1522     d29u8 = vext_u8(d9u8, d10u8, 1);
   1523     d30u8 = vext_u8(d12u8, d13u8, 1);
   1524     q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
   1525     q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
   1526     q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
   1527     d28u8 = vext_u8(d7u8, d8u8, 1);
   1528     d29u8 = vext_u8(d10u8, d11u8, 1);
   1529     d30u8 = vext_u8(d13u8, d14u8, 1);
   1530     q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
   1531     q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
   1532     q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
   1533 
   1534     d28u8 = vext_u8(d6u8, d7u8, 4);
   1535     d29u8 = vext_u8(d9u8, d10u8, 4);
   1536     d30u8 = vext_u8(d12u8, d13u8, 4);
   1537     q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
   1538     q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
   1539     q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
   1540     d28u8 = vext_u8(d7u8, d8u8, 4);
   1541     d29u8 = vext_u8(d10u8, d11u8, 4);
   1542     d30u8 = vext_u8(d13u8, d14u8, 4);
   1543     q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
   1544     q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
   1545     q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
   1546 
   1547     d28u8 = vext_u8(d6u8, d7u8, 5);
   1548     d29u8 = vext_u8(d9u8, d10u8, 5);
   1549     d30u8 = vext_u8(d12u8, d13u8, 5);
   1550     q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
   1551     q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
   1552     q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
   1553     d28u8 = vext_u8(d7u8, d8u8, 5);
   1554     d29u8 = vext_u8(d10u8, d11u8, 5);
   1555     d30u8 = vext_u8(d13u8, d14u8, 5);
   1556     q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
   1557     q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
   1558     q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
   1559 
   1560     d28u8 = vext_u8(d6u8, d7u8, 2);
   1561     d29u8 = vext_u8(d9u8, d10u8, 2);
   1562     d30u8 = vext_u8(d12u8, d13u8, 2);
   1563     q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
   1564     q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
   1565     q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
   1566     d28u8 = vext_u8(d7u8, d8u8, 2);
   1567     d29u8 = vext_u8(d10u8, d11u8, 2);
   1568     d30u8 = vext_u8(d13u8, d14u8, 2);
   1569     q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
   1570     q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
   1571     q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
   1572 
   1573     d28u8 = vext_u8(d6u8, d7u8, 3);
   1574     d29u8 = vext_u8(d9u8, d10u8, 3);
   1575     d30u8 = vext_u8(d12u8, d13u8, 3);
   1576     d15u8 = vext_u8(d7u8, d8u8, 3);
   1577     d31u8 = vext_u8(d10u8, d11u8, 3);
   1578     d6u8 = vext_u8(d13u8, d14u8, 3);
   1579     q4u16 = vmull_u8(d28u8, d3u8);
   1580     q5u16 = vmull_u8(d29u8, d3u8);
   1581     q6u16 = vmull_u8(d30u8, d3u8);
   1582     q4s16 = vreinterpretq_s16_u16(q4u16);
   1583     q5s16 = vreinterpretq_s16_u16(q5u16);
   1584     q6s16 = vreinterpretq_s16_u16(q6u16);
   1585     q8s16 = vreinterpretq_s16_u16(q8u16);
   1586     q10s16 = vreinterpretq_s16_u16(q10u16);
   1587     q12s16 = vreinterpretq_s16_u16(q12u16);
   1588     q8s16 = vqaddq_s16(q8s16, q4s16);
   1589     q10s16 = vqaddq_s16(q10s16, q5s16);
   1590     q12s16 = vqaddq_s16(q12s16, q6s16);
   1591 
   1592     q6u16 = vmull_u8(d15u8, d3u8);
   1593     q7u16 = vmull_u8(d31u8, d3u8);
   1594     q3u16 = vmull_u8(d6u8, d3u8);
   1595     q3s16 = vreinterpretq_s16_u16(q3u16);
   1596     q6s16 = vreinterpretq_s16_u16(q6u16);
   1597     q7s16 = vreinterpretq_s16_u16(q7u16);
   1598     q9s16 = vreinterpretq_s16_u16(q9u16);
   1599     q11s16 = vreinterpretq_s16_u16(q11u16);
   1600     q13s16 = vreinterpretq_s16_u16(q13u16);
   1601     q9s16 = vqaddq_s16(q9s16, q6s16);
   1602     q11s16 = vqaddq_s16(q11s16, q7s16);
   1603     q13s16 = vqaddq_s16(q13s16, q3s16);
   1604 
   1605     d6u8 = vqrshrun_n_s16(q8s16, 7);
   1606     d7u8 = vqrshrun_n_s16(q9s16, 7);
   1607     d8u8 = vqrshrun_n_s16(q10s16, 7);
   1608     d9u8 = vqrshrun_n_s16(q11s16, 7);
   1609     d10u8 = vqrshrun_n_s16(q12s16, 7);
   1610     d11u8 = vqrshrun_n_s16(q13s16, 7);
   1611 
   1612     vst1_u8(tmpp, d6u8);
   1613     tmpp += 8;
   1614     vst1_u8(tmpp, d7u8);
   1615     tmpp += 8;
   1616     vst1_u8(tmpp, d8u8);
   1617     tmpp += 8;
   1618     vst1_u8(tmpp, d9u8);
   1619     tmpp += 8;
   1620     vst1_u8(tmpp, d10u8);
   1621     tmpp += 8;
   1622     vst1_u8(tmpp, d11u8);
   1623     tmpp += 8;
   1624   }
   1625 
   1626   // Second pass: 16x16
   1627   dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
   1628   d0s8 = vdup_lane_s8(dtmps8, 0);
   1629   d1s8 = vdup_lane_s8(dtmps8, 1);
   1630   d2s8 = vdup_lane_s8(dtmps8, 2);
   1631   d3s8 = vdup_lane_s8(dtmps8, 3);
   1632   d4s8 = vdup_lane_s8(dtmps8, 4);
   1633   d5s8 = vdup_lane_s8(dtmps8, 5);
   1634   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
   1635   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
   1636   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
   1637   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
   1638   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
   1639   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
   1640 
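           /* Each outer iteration below filters one 8-pixel-wide half vertically,
            * reading tmp with a 16-byte row stride and reusing the five-row
            * priming / four-rows-per-step window from the secondpass-only path
            * above. */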
   1641   for (i = 0; i < 2; ++i) {
   1642     dst = dst_ptr + 8 * i;
   1643     tmpp = tmp + 8 * i;
   1644     d18u8 = vld1_u8(tmpp);
   1645     tmpp += 16;
   1646     d19u8 = vld1_u8(tmpp);
   1647     tmpp += 16;
   1648     d20u8 = vld1_u8(tmpp);
   1649     tmpp += 16;
   1650     d21u8 = vld1_u8(tmpp);
   1651     tmpp += 16;
   1652     d22u8 = vld1_u8(tmpp);
   1653     tmpp += 16;
   1654     for (j = 0; j < 4; ++j) {
   1655       d23u8 = vld1_u8(tmpp);
   1656       tmpp += 16;
   1657       d24u8 = vld1_u8(tmpp);
   1658       tmpp += 16;
   1659       d25u8 = vld1_u8(tmpp);
   1660       tmpp += 16;
   1661       d26u8 = vld1_u8(tmpp);
   1662       tmpp += 16;
   1663 
   1664       q3u16 = vmull_u8(d18u8, d0u8);
   1665       q4u16 = vmull_u8(d19u8, d0u8);
   1666       q5u16 = vmull_u8(d20u8, d0u8);
   1667       q6u16 = vmull_u8(d21u8, d0u8);
   1668 
   1669       q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
   1670       q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
   1671       q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
   1672       q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
   1673 
   1674       q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
   1675       q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
   1676       q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
   1677       q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
   1678 
   1679       q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
   1680       q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
   1681       q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
   1682       q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
   1683 
   1684       q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
   1685       q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
   1686       q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
   1687       q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
   1688 
   1689       q7u16 = vmull_u8(d21u8, d3u8);
   1690       q8u16 = vmull_u8(d22u8, d3u8);
   1691       q9u16 = vmull_u8(d23u8, d3u8);
   1692       q10u16 = vmull_u8(d24u8, d3u8);
   1693 
   1694       q3s16 = vreinterpretq_s16_u16(q3u16);
   1695       q4s16 = vreinterpretq_s16_u16(q4u16);
   1696       q5s16 = vreinterpretq_s16_u16(q5u16);
   1697       q6s16 = vreinterpretq_s16_u16(q6u16);
   1698       q7s16 = vreinterpretq_s16_u16(q7u16);
   1699       q8s16 = vreinterpretq_s16_u16(q8u16);
   1700       q9s16 = vreinterpretq_s16_u16(q9u16);
   1701       q10s16 = vreinterpretq_s16_u16(q10u16);
   1702 
   1703       q7s16 = vqaddq_s16(q7s16, q3s16);
   1704       q8s16 = vqaddq_s16(q8s16, q4s16);
   1705       q9s16 = vqaddq_s16(q9s16, q5s16);
   1706       q10s16 = vqaddq_s16(q10s16, q6s16);
   1707 
   1708       d6u8 = vqrshrun_n_s16(q7s16, 7);
   1709       d7u8 = vqrshrun_n_s16(q8s16, 7);
   1710       d8u8 = vqrshrun_n_s16(q9s16, 7);
   1711       d9u8 = vqrshrun_n_s16(q10s16, 7);
   1712 
   1713       d18u8 = d22u8;
   1714       d19u8 = d23u8;
   1715       d20u8 = d24u8;
   1716       d21u8 = d25u8;
   1717       d22u8 = d26u8;
   1718 
   1719       vst1_u8(dst, d6u8);
   1720       dst += dst_pitch;
   1721       vst1_u8(dst, d7u8);
   1722       dst += dst_pitch;
   1723       vst1_u8(dst, d8u8);
   1724       dst += dst_pitch;
   1725       vst1_u8(dst, d9u8);
   1726       dst += dst_pitch;
   1727     }
   1728   }
   1729   return;
   1730 }
   1731