      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 #include <string.h>
     13 #include "./vpx_config.h"
     14 #include "vpx_dsp/arm/mem_neon.h"
     15 #include "vpx_ports/mem.h"
     16 
     17 static const int8_t vp8_sub_pel_filters[8][8] = {
      18   { 0, 0, 128, 0, 0, 0, 0, 0 },     /* note that 1/8 pel positions are */
     19   { 0, -6, 123, 12, -1, 0, 0, 0 },  /*    just as per alpha -0.5 bicubic */
     20   { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
     21   { 0, -9, 93, 50, -6, 0, 0, 0 },
     22   { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
     23   { 0, -6, 50, 93, -9, 0, 0, 0 },
     24   { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
     25   { 0, -1, 12, 123, -6, 0, 0, 0 },
     26 };
     27 
     28 // This table is derived from vp8/common/filter.c:vp8_sub_pel_filters.
     29 // Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive.
     30 // Elements 1 and 4 are either 0 or negative. The code accounts for this with
     31 // multiply/accumulates which either add or subtract as needed. The other
     32 // functions will be updated to use this table later.
     33 // It is also expanded to 8 elements to allow loading into 64 bit neon
     34 // registers.
     35 static const uint8_t abs_filters[8][8] = {
     36   { 0, 0, 128, 0, 0, 0, 0, 0 },   { 0, 6, 123, 12, 1, 0, 0, 0 },
     37   { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
     38   { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
     39   { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
     40 };
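         // Example: vp8_sub_pel_filters[2] = { 2, -11, 108, 36, -8, 1 } becomes
         // { 2, 11, 108, 36, 8, 1 } here; the code applies taps 1 and 4 with
         // vmlsl_u8 (multiply-subtract) and the positive taps with vmlal_u8.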
     41 
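         // Load 8 bytes and shift the first four of them into the upper half of
         // the 64-bit lane (little-endian), so vext_u8 can later pair this row's
         // pixels with the next row's.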
     42 static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
     43   return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
     44 }
     45 
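         // filter_add_accumulate()/filter_sub_accumulate() zip out the low 4
         // bytes of each of the two rows packed into 'a' and 'b' and
         // multiply-accumulate them against a single filter tap: vmlal_u8 for
         // the positive taps, vmlsl_u8 for the negated ones.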
     46 static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
     47                                          const uint8x8_t filter, uint16x8_t *c,
     48                                          uint16x8_t *d) {
     49   const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
     50                                        vreinterpret_u32_u8(vget_high_u8(a)));
     51   const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
     52                                        vreinterpret_u32_u8(vget_high_u8(b)));
     53   *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
     54   *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
     55 }
     56 
     57 static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
     58                                          const uint8x8_t filter, uint16x8_t *c,
     59                                          uint16x8_t *d) {
     60   const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
     61                                        vreinterpret_u32_u8(vget_high_u8(a)));
     62   const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
     63                                        vreinterpret_u32_u8(vget_high_u8(b)));
     64   *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
     65   *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
     66 }
     67 
     68 static INLINE void yonly4x4(const unsigned char *src, int src_stride,
     69                             int filter_offset, unsigned char *dst,
     70                             int dst_stride) {
     71   uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
     72   uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
     73   uint16x8_t c0, c1, c2, c3;
     74   int16x8_t d0, d1;
     75   uint8x8_t e0, e1;
     76 
     77   const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
     78   const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
     79   const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
     80   const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
     81   const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
     82   const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
     83   const uint8x8_t filter5 = vdup_lane_u8(filter, 5);
     84 
     85   src -= src_stride * 2;
     86   // Shift the even rows to allow using 'vext' to combine the vectors. armv8
     87   // has vcopy_lane which would be interesting. This started as just a
     88   // horrible workaround for clang adding alignment hints to 32bit loads:
     89   // https://llvm.org/bugs/show_bug.cgi?id=24421
      90   // But it turns out it is almost identical to casting the loads.
     91   a0 = load_and_shift(src);
     92   src += src_stride;
     93   a1 = vld1_u8(src);
     94   src += src_stride;
     95   a2 = load_and_shift(src);
     96   src += src_stride;
     97   a3 = vld1_u8(src);
     98   src += src_stride;
     99   a4 = load_and_shift(src);
    100   src += src_stride;
    101   a5 = vld1_u8(src);
    102   src += src_stride;
    103   a6 = load_and_shift(src);
    104   src += src_stride;
    105   a7 = vld1_u8(src);
    106   src += src_stride;
    107   a8 = vld1_u8(src);
    108 
    109   // Combine the rows so we can operate on 8 at a time.
    110   b0 = vext_u8(a0, a1, 4);
    111   b2 = vext_u8(a2, a3, 4);
    112   b4 = vext_u8(a4, a5, 4);
    113   b6 = vext_u8(a6, a7, 4);
    114   b8 = a8;
    115 
    116   // To keep with the 8-at-a-time theme, combine *alternate* rows. This
    117   // allows combining the odd rows with the even.
    118   b1 = vext_u8(b0, b2, 4);
    119   b3 = vext_u8(b2, b4, 4);
    120   b5 = vext_u8(b4, b6, 4);
    121   b7 = vext_u8(b6, b8, 4);
    122 
    123   // Multiply and expand to 16 bits.
    124   c0 = vmull_u8(b0, filter0);
    125   c1 = vmull_u8(b2, filter0);
    126   c2 = vmull_u8(b5, filter5);
    127   c3 = vmull_u8(b7, filter5);
    128 
    129   // Multiply, subtract and accumulate for filters 1 and 4 (the negative
    130   // ones).
    131   c0 = vmlsl_u8(c0, b4, filter4);
    132   c1 = vmlsl_u8(c1, b6, filter4);
    133   c2 = vmlsl_u8(c2, b1, filter1);
    134   c3 = vmlsl_u8(c3, b3, filter1);
    135 
    136   // Add more positive ones. vmlal should really return a signed type.
    137   // It's doing signed math internally, as evidenced by the fact we can do
    138   // subtractions followed by more additions. Ideally we could use
    139   // vqmlal/sl but that instruction doesn't exist. Might be able to
    140   // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
    141   c0 = vmlal_u8(c0, b2, filter2);
    142   c1 = vmlal_u8(c1, b4, filter2);
    143   c2 = vmlal_u8(c2, b3, filter3);
    144   c3 = vmlal_u8(c3, b5, filter3);
    145 
    146   // Use signed saturation math because vmlsl may have left some negative
    147   // numbers in there.
    148   d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
    149   d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
    150 
    151   // Use signed again because numbers like -200 need to be saturated to 0.
    152   e0 = vqrshrun_n_s16(d0, 7);
    153   e1 = vqrshrun_n_s16(d1, 7);
    154 
    155   store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
    156 }
    157 
    158 void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
    159                                 int xoffset, int yoffset,
    160                                 unsigned char *dst_ptr, int dst_pitch) {
    161   uint8x16_t s0, s1, s2, s3, s4;
    162   uint64x2_t s01, s23;
    163   // Variables to hold src[] elements for the given filter[]
    164   uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
    165   uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
    166   uint8x16_t s01_f0, s23_f0;
    167   uint64x2_t s01_f3, s23_f3;
    168   uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
    169   // Accumulator variables.
    170   uint16x8_t d0123, d4567, d89;
    171   uint16x8_t d0123_a, d4567_a, d89_a;
    172   int16x8_t e0123, e4567, e89;
    173   // Second pass intermediates.
    174   uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
    175   uint16x8_t c0, c1, c2, c3;
    176   int16x8_t d0, d1;
    177   uint8x8_t e0, e1;
    178   uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;
    179 
    180   if (xoffset == 0) {  // Second pass only.
    181     yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
    182     return;
    183   }
    184 
    185   if (yoffset == 0) {  // First pass only.
    186     src_ptr -= 2;
    187   } else {  // Add context for the second pass. 2 extra lines on top.
    188     src_ptr -= 2 + (src_pixels_per_line * 2);
    189   }
    190 
    191   filter = vld1_u8(abs_filters[xoffset]);
    192   filter0 = vdup_lane_u8(filter, 0);
    193   filter1 = vdup_lane_u8(filter, 1);
    194   filter2 = vdup_lane_u8(filter, 2);
    195   filter3 = vdup_lane_u8(filter, 3);
    196   filter4 = vdup_lane_u8(filter, 4);
    197   filter5 = vdup_lane_u8(filter, 5);
    198 
    199   // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
    200   // garbage. So much effort for that last single bit.
    201   // The low values of each pair are for filter0.
    202   s0 = vld1q_u8(src_ptr);
    203   src_ptr += src_pixels_per_line;
    204   s1 = vld1q_u8(src_ptr);
    205   src_ptr += src_pixels_per_line;
    206   s2 = vld1q_u8(src_ptr);
    207   src_ptr += src_pixels_per_line;
    208   s3 = vld1q_u8(src_ptr);
    209   src_ptr += src_pixels_per_line;
    210 
    211   // Shift to extract values for filter[5]
     212   // In terms of the original, unadjusted src_ptr this puts:
     213   // src[3] src[4] src[5] src[6] src[7] src[8] src[9] src[10] in s0_f5
    214   // Can't use vshr.u64 because it crosses the double word boundary.
    215   s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
    216   s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
    217   s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
    218   s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
    219 
    220   s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
    221   s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
    222 
    223   s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
    224   s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
    225   d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
    226   d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
    227 
    228   // Keep original src data as 64 bits to simplify shifting and extracting.
    229   s01 = vreinterpretq_u64_u8(s01_f0);
    230   s23 = vreinterpretq_u64_u8(s23_f0);
    231 
     232   // -2 -1 0 1 * filter0
    233   filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
    234 
    235   // Shift over one to use -1, 0, 1, 2 for filter1
    236   // -1 0 1 2 * filter1
    237   filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
    238                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
    239                         &d0123, &d4567);
    240 
    241   // 2 3 4 5 * filter4
    242   filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
    243                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
    244                         &d0123, &d4567);
    245 
    246   // 0 1 2 3 * filter2
    247   filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
    248                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
    249                         &d0123, &d4567);
    250 
    251   // 1 2 3 4 * filter3
    252   s01_f3 = vshrq_n_u64(s01, 24);
    253   s23_f3 = vshrq_n_u64(s23, 24);
    254   s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
    255                       vreinterpret_u32_u64(vget_high_u64(s01_f3)));
    256   s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
    257                       vreinterpret_u32_u64(vget_high_u64(s23_f3)));
    258   // Accumulate into different registers so it can use saturated addition.
    259   d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
    260   d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
    261 
    262   e0123 =
    263       vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
    264   e4567 =
    265       vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
    266 
    267   // Shift and narrow.
    268   b0 = vqrshrun_n_s16(e0123, 7);
    269   b2 = vqrshrun_n_s16(e4567, 7);
    270 
    271   if (yoffset == 0) {  // firstpass_filter4x4_only
    272     store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
    273     return;
    274   }
    275 
    276   // Load additional context when doing both filters.
    277   s0 = vld1q_u8(src_ptr);
    278   src_ptr += src_pixels_per_line;
    279   s1 = vld1q_u8(src_ptr);
    280   src_ptr += src_pixels_per_line;
    281   s2 = vld1q_u8(src_ptr);
    282   src_ptr += src_pixels_per_line;
    283   s3 = vld1q_u8(src_ptr);
    284   src_ptr += src_pixels_per_line;
    285   s4 = vld1q_u8(src_ptr);
    286 
    287   s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
    288   s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
    289   s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
    290   s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
    291   s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);
    292 
     293   // -2 -1 0 1 * filter0
    294   s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
    295   s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
    296 
    297   s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
    298   s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
    299   // But this time instead of 16 pixels to filter, there are 20. So an extra
    300   // run with a doubleword register.
    301   d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
    302   d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
    303   d89 = vmull_u8(s4_f5, filter5);
    304 
    305   // Save a copy as u64 for shifting.
    306   s01 = vreinterpretq_u64_u8(s01_f0);
    307   s23 = vreinterpretq_u64_u8(s23_f0);
    308 
    309   filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
    310   d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);
    311 
    312   filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
    313                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
    314                         &d0123, &d4567);
    315   s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
    316   d89 = vmlsl_u8(d89, s4_f1, filter1);
    317 
    318   filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
    319                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
    320                         &d0123, &d4567);
    321   s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
    322   d89 = vmlsl_u8(d89, s4_f4, filter4);
    323 
    324   filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
    325                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
    326                         &d0123, &d4567);
    327   s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
    328   d89 = vmlal_u8(d89, s4_f2, filter2);
    329 
    330   s01_f3 = vshrq_n_u64(s01, 24);
    331   s23_f3 = vshrq_n_u64(s23, 24);
    332   s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
    333                       vreinterpret_u32_u64(vget_high_u64(s01_f3)));
    334   s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
    335                       vreinterpret_u32_u64(vget_high_u64(s23_f3)));
    336   s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
    337   d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
    338   d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
    339   d89_a = vmull_u8(s4_f3, filter3);
    340 
    341   e0123 =
    342       vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
    343   e4567 =
    344       vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
    345   e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));
    346 
    347   b4 = vqrshrun_n_s16(e0123, 7);
    348   b6 = vqrshrun_n_s16(e4567, 7);
    349   b8 = vqrshrun_n_s16(e89, 7);
    350 
    351   // Second pass: 4x4
    352   filter = vld1_u8(abs_filters[yoffset]);
    353   filter0 = vdup_lane_u8(filter, 0);
    354   filter1 = vdup_lane_u8(filter, 1);
    355   filter2 = vdup_lane_u8(filter, 2);
    356   filter3 = vdup_lane_u8(filter, 3);
    357   filter4 = vdup_lane_u8(filter, 4);
    358   filter5 = vdup_lane_u8(filter, 5);
    359 
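           // Combine alternate rows and apply the vertical taps, the same
           // sequence as in yonly4x4() above.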
    360   b1 = vext_u8(b0, b2, 4);
    361   b3 = vext_u8(b2, b4, 4);
    362   b5 = vext_u8(b4, b6, 4);
    363   b7 = vext_u8(b6, b8, 4);
    364 
    365   c0 = vmull_u8(b0, filter0);
    366   c1 = vmull_u8(b2, filter0);
    367   c2 = vmull_u8(b5, filter5);
    368   c3 = vmull_u8(b7, filter5);
    369 
    370   c0 = vmlsl_u8(c0, b4, filter4);
    371   c1 = vmlsl_u8(c1, b6, filter4);
    372   c2 = vmlsl_u8(c2, b1, filter1);
    373   c3 = vmlsl_u8(c3, b3, filter1);
    374 
    375   c0 = vmlal_u8(c0, b2, filter2);
    376   c1 = vmlal_u8(c1, b4, filter2);
    377   c2 = vmlal_u8(c2, b3, filter3);
    378   c3 = vmlal_u8(c3, b5, filter3);
    379 
    380   d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
    381   d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
    382 
    383   e0 = vqrshrun_n_s16(d0, 7);
    384   e1 = vqrshrun_n_s16(d1, 7);
    385 
    386   store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
    387 }
    388 
    389 void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
    390                                 int xoffset, int yoffset,
    391                                 unsigned char *dst_ptr, int dst_pitch) {
    392   unsigned char *src;
    393   uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    394   uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
    395   uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
    396   int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    397   uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    398   uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    399   int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    400   int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    401   uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
    402 
    403   if (xoffset == 0) {  // secondpass_filter8x4_only
    404     // load second_pass filter
    405     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    406     d0s8 = vdup_lane_s8(dtmps8, 0);
    407     d1s8 = vdup_lane_s8(dtmps8, 1);
    408     d2s8 = vdup_lane_s8(dtmps8, 2);
    409     d3s8 = vdup_lane_s8(dtmps8, 3);
    410     d4s8 = vdup_lane_s8(dtmps8, 4);
    411     d5s8 = vdup_lane_s8(dtmps8, 5);
    412     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    413     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    414     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    415     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    416     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    417     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    418 
    419     // load src data
    420     src = src_ptr - src_pixels_per_line * 2;
    421     d22u8 = vld1_u8(src);
    422     src += src_pixels_per_line;
    423     d23u8 = vld1_u8(src);
    424     src += src_pixels_per_line;
    425     d24u8 = vld1_u8(src);
    426     src += src_pixels_per_line;
    427     d25u8 = vld1_u8(src);
    428     src += src_pixels_per_line;
    429     d26u8 = vld1_u8(src);
    430     src += src_pixels_per_line;
    431     d27u8 = vld1_u8(src);
    432     src += src_pixels_per_line;
    433     d28u8 = vld1_u8(src);
    434     src += src_pixels_per_line;
    435     d29u8 = vld1_u8(src);
    436     src += src_pixels_per_line;
    437     d30u8 = vld1_u8(src);
    438 
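             // Vertical 6-tap filter: tap 0 starts each accumulator, taps 1 and
             // 4 are subtracted (they are negative in the original filter), taps
             // 2 and 5 are added, and tap 3 goes into separate registers so the
             // final combine can use saturating addition.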
    439     q3u16 = vmull_u8(d22u8, d0u8);
    440     q4u16 = vmull_u8(d23u8, d0u8);
    441     q5u16 = vmull_u8(d24u8, d0u8);
    442     q6u16 = vmull_u8(d25u8, d0u8);
    443 
    444     q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    445     q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    446     q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    447     q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
    448 
    449     q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    450     q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    451     q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    452     q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
    453 
    454     q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    455     q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    456     q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    457     q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
    458 
    459     q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    460     q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    461     q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    462     q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
    463 
    464     q7u16 = vmull_u8(d25u8, d3u8);
    465     q8u16 = vmull_u8(d26u8, d3u8);
    466     q9u16 = vmull_u8(d27u8, d3u8);
    467     q10u16 = vmull_u8(d28u8, d3u8);
    468 
    469     q3s16 = vreinterpretq_s16_u16(q3u16);
    470     q4s16 = vreinterpretq_s16_u16(q4u16);
    471     q5s16 = vreinterpretq_s16_u16(q5u16);
    472     q6s16 = vreinterpretq_s16_u16(q6u16);
    473     q7s16 = vreinterpretq_s16_u16(q7u16);
    474     q8s16 = vreinterpretq_s16_u16(q8u16);
    475     q9s16 = vreinterpretq_s16_u16(q9u16);
    476     q10s16 = vreinterpretq_s16_u16(q10u16);
    477 
    478     q7s16 = vqaddq_s16(q7s16, q3s16);
    479     q8s16 = vqaddq_s16(q8s16, q4s16);
    480     q9s16 = vqaddq_s16(q9s16, q5s16);
    481     q10s16 = vqaddq_s16(q10s16, q6s16);
    482 
    483     d6u8 = vqrshrun_n_s16(q7s16, 7);
    484     d7u8 = vqrshrun_n_s16(q8s16, 7);
    485     d8u8 = vqrshrun_n_s16(q9s16, 7);
    486     d9u8 = vqrshrun_n_s16(q10s16, 7);
    487 
    488     vst1_u8(dst_ptr, d6u8);
    489     dst_ptr += dst_pitch;
    490     vst1_u8(dst_ptr, d7u8);
    491     dst_ptr += dst_pitch;
    492     vst1_u8(dst_ptr, d8u8);
    493     dst_ptr += dst_pitch;
    494     vst1_u8(dst_ptr, d9u8);
    495     return;
    496   }
    497 
    498   // load first_pass filter
    499   dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    500   d0s8 = vdup_lane_s8(dtmps8, 0);
    501   d1s8 = vdup_lane_s8(dtmps8, 1);
    502   d2s8 = vdup_lane_s8(dtmps8, 2);
    503   d3s8 = vdup_lane_s8(dtmps8, 3);
    504   d4s8 = vdup_lane_s8(dtmps8, 4);
    505   d5s8 = vdup_lane_s8(dtmps8, 5);
    506   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    507   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    508   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    509   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    510   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    511   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    512 
     513   // First pass: output_height lines x output_width columns (9x8)
     514   if (yoffset == 0)  // firstpass_filter8x4_only
    515     src = src_ptr - 2;
    516   else
    517     src = src_ptr - 2 - (src_pixels_per_line * 2);
    518   q3u8 = vld1q_u8(src);
    519   src += src_pixels_per_line;
    520   q4u8 = vld1q_u8(src);
    521   src += src_pixels_per_line;
    522   q5u8 = vld1q_u8(src);
    523   src += src_pixels_per_line;
    524   q6u8 = vld1q_u8(src);
    525 
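           // Horizontal 6-tap filter: tap 0 uses bytes 0..7 of each load and
           // vext_u8 selects the bytes offset by 1..5 for the remaining taps.
           // Taps 1 and 4 subtract, taps 2 and 5 add, and tap 3 accumulates
           // separately so the final combine can saturate.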
    526   q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    527   q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    528   q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    529   q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    530 
    531   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    532   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    533   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    534   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    535 
    536   q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
    537   q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
    538   q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
    539   q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
    540 
    541   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    542   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    543   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    544   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    545 
    546   q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
    547   q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
    548   q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
    549   q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
    550 
    551   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    552   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    553   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    554   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    555 
    556   q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
    557   q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
    558   q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
    559   q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
    560 
    561   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    562   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    563   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    564   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    565 
    566   q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
    567   q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
    568   q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
    569   q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
    570 
    571   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    572   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    573   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    574   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    575 
    576   q3u16 = vmull_u8(d28u8, d3u8);
    577   q4u16 = vmull_u8(d29u8, d3u8);
    578   q5u16 = vmull_u8(d30u8, d3u8);
    579   q6u16 = vmull_u8(d31u8, d3u8);
    580 
    581   q3s16 = vreinterpretq_s16_u16(q3u16);
    582   q4s16 = vreinterpretq_s16_u16(q4u16);
    583   q5s16 = vreinterpretq_s16_u16(q5u16);
    584   q6s16 = vreinterpretq_s16_u16(q6u16);
    585   q7s16 = vreinterpretq_s16_u16(q7u16);
    586   q8s16 = vreinterpretq_s16_u16(q8u16);
    587   q9s16 = vreinterpretq_s16_u16(q9u16);
    588   q10s16 = vreinterpretq_s16_u16(q10u16);
    589 
    590   q7s16 = vqaddq_s16(q7s16, q3s16);
    591   q8s16 = vqaddq_s16(q8s16, q4s16);
    592   q9s16 = vqaddq_s16(q9s16, q5s16);
    593   q10s16 = vqaddq_s16(q10s16, q6s16);
    594 
    595   d22u8 = vqrshrun_n_s16(q7s16, 7);
    596   d23u8 = vqrshrun_n_s16(q8s16, 7);
    597   d24u8 = vqrshrun_n_s16(q9s16, 7);
    598   d25u8 = vqrshrun_n_s16(q10s16, 7);
    599 
    600   if (yoffset == 0) {  // firstpass_filter8x4_only
    601     vst1_u8(dst_ptr, d22u8);
    602     dst_ptr += dst_pitch;
    603     vst1_u8(dst_ptr, d23u8);
    604     dst_ptr += dst_pitch;
    605     vst1_u8(dst_ptr, d24u8);
    606     dst_ptr += dst_pitch;
    607     vst1_u8(dst_ptr, d25u8);
    608     return;
    609   }
    610 
     611   // First pass on the remaining 5 lines of data
    612   src += src_pixels_per_line;
    613   q3u8 = vld1q_u8(src);
    614   src += src_pixels_per_line;
    615   q4u8 = vld1q_u8(src);
    616   src += src_pixels_per_line;
    617   q5u8 = vld1q_u8(src);
    618   src += src_pixels_per_line;
    619   q6u8 = vld1q_u8(src);
    620   src += src_pixels_per_line;
    621   q7u8 = vld1q_u8(src);
    622 
    623   q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    624   q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    625   q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    626   q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    627   q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
    628 
    629   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    630   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    631   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    632   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    633   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
    634 
    635   q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
    636   q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    637   q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    638   q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
    639   q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
    640 
    641   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    642   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    643   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    644   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    645   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
    646 
    647   q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
    648   q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    649   q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    650   q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
    651   q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
    652 
    653   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    654   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    655   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    656   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    657   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
    658 
    659   q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
    660   q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    661   q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    662   q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
    663   q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
    664 
    665   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    666   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    667   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    668   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    669   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
    670 
    671   q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
    672   q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    673   q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    674   q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
    675   q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
    676 
    677   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    678   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    679   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    680   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    681   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
    682 
    683   q3u16 = vmull_u8(d27u8, d3u8);
    684   q4u16 = vmull_u8(d28u8, d3u8);
    685   q5u16 = vmull_u8(d29u8, d3u8);
    686   q6u16 = vmull_u8(d30u8, d3u8);
    687   q7u16 = vmull_u8(d31u8, d3u8);
    688 
    689   q3s16 = vreinterpretq_s16_u16(q3u16);
    690   q4s16 = vreinterpretq_s16_u16(q4u16);
    691   q5s16 = vreinterpretq_s16_u16(q5u16);
    692   q6s16 = vreinterpretq_s16_u16(q6u16);
    693   q7s16 = vreinterpretq_s16_u16(q7u16);
    694   q8s16 = vreinterpretq_s16_u16(q8u16);
    695   q9s16 = vreinterpretq_s16_u16(q9u16);
    696   q10s16 = vreinterpretq_s16_u16(q10u16);
    697   q11s16 = vreinterpretq_s16_u16(q11u16);
    698   q12s16 = vreinterpretq_s16_u16(q12u16);
    699 
    700   q8s16 = vqaddq_s16(q8s16, q3s16);
    701   q9s16 = vqaddq_s16(q9s16, q4s16);
    702   q10s16 = vqaddq_s16(q10s16, q5s16);
    703   q11s16 = vqaddq_s16(q11s16, q6s16);
    704   q12s16 = vqaddq_s16(q12s16, q7s16);
    705 
    706   d26u8 = vqrshrun_n_s16(q8s16, 7);
    707   d27u8 = vqrshrun_n_s16(q9s16, 7);
    708   d28u8 = vqrshrun_n_s16(q10s16, 7);
    709   d29u8 = vqrshrun_n_s16(q11s16, 7);
    710   d30u8 = vqrshrun_n_s16(q12s16, 7);
    711 
    712   // Second pass: 8x4
    713   dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    714   d0s8 = vdup_lane_s8(dtmps8, 0);
    715   d1s8 = vdup_lane_s8(dtmps8, 1);
    716   d2s8 = vdup_lane_s8(dtmps8, 2);
    717   d3s8 = vdup_lane_s8(dtmps8, 3);
    718   d4s8 = vdup_lane_s8(dtmps8, 4);
    719   d5s8 = vdup_lane_s8(dtmps8, 5);
    720   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    721   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    722   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    723   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    724   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    725   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    726 
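           // Vertical 6-tap filter over the nine intermediate rows
           // d22u8..d30u8, producing the four output rows.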
    727   q3u16 = vmull_u8(d22u8, d0u8);
    728   q4u16 = vmull_u8(d23u8, d0u8);
    729   q5u16 = vmull_u8(d24u8, d0u8);
    730   q6u16 = vmull_u8(d25u8, d0u8);
    731 
    732   q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    733   q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    734   q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    735   q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
    736 
    737   q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    738   q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    739   q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    740   q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
    741 
    742   q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    743   q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    744   q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    745   q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
    746 
    747   q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    748   q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    749   q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    750   q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
    751 
    752   q7u16 = vmull_u8(d25u8, d3u8);
    753   q8u16 = vmull_u8(d26u8, d3u8);
    754   q9u16 = vmull_u8(d27u8, d3u8);
    755   q10u16 = vmull_u8(d28u8, d3u8);
    756 
    757   q3s16 = vreinterpretq_s16_u16(q3u16);
    758   q4s16 = vreinterpretq_s16_u16(q4u16);
    759   q5s16 = vreinterpretq_s16_u16(q5u16);
    760   q6s16 = vreinterpretq_s16_u16(q6u16);
    761   q7s16 = vreinterpretq_s16_u16(q7u16);
    762   q8s16 = vreinterpretq_s16_u16(q8u16);
    763   q9s16 = vreinterpretq_s16_u16(q9u16);
    764   q10s16 = vreinterpretq_s16_u16(q10u16);
    765 
    766   q7s16 = vqaddq_s16(q7s16, q3s16);
    767   q8s16 = vqaddq_s16(q8s16, q4s16);
    768   q9s16 = vqaddq_s16(q9s16, q5s16);
    769   q10s16 = vqaddq_s16(q10s16, q6s16);
    770 
    771   d6u8 = vqrshrun_n_s16(q7s16, 7);
    772   d7u8 = vqrshrun_n_s16(q8s16, 7);
    773   d8u8 = vqrshrun_n_s16(q9s16, 7);
    774   d9u8 = vqrshrun_n_s16(q10s16, 7);
    775 
    776   vst1_u8(dst_ptr, d6u8);
    777   dst_ptr += dst_pitch;
    778   vst1_u8(dst_ptr, d7u8);
    779   dst_ptr += dst_pitch;
    780   vst1_u8(dst_ptr, d8u8);
    781   dst_ptr += dst_pitch;
    782   vst1_u8(dst_ptr, d9u8);
    783   return;
    784 }
    785 
    786 void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
    787                                 int xoffset, int yoffset,
    788                                 unsigned char *dst_ptr, int dst_pitch) {
    789   unsigned char *src, *tmpp;
    790   unsigned char tmp[64];
    791   int i;
    792   uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    793   uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
    794   uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
    795   int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    796   uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    797   uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    798   int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    799   int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    800   uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
    801 
    802   if (xoffset == 0) {  // secondpass_filter8x8_only
    803     // load second_pass filter
    804     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    805     d0s8 = vdup_lane_s8(dtmps8, 0);
    806     d1s8 = vdup_lane_s8(dtmps8, 1);
    807     d2s8 = vdup_lane_s8(dtmps8, 2);
    808     d3s8 = vdup_lane_s8(dtmps8, 3);
    809     d4s8 = vdup_lane_s8(dtmps8, 4);
    810     d5s8 = vdup_lane_s8(dtmps8, 5);
    811     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    812     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    813     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    814     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    815     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    816     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    817 
    818     // load src data
    819     src = src_ptr - src_pixels_per_line * 2;
    820     d18u8 = vld1_u8(src);
    821     src += src_pixels_per_line;
    822     d19u8 = vld1_u8(src);
    823     src += src_pixels_per_line;
    824     d20u8 = vld1_u8(src);
    825     src += src_pixels_per_line;
    826     d21u8 = vld1_u8(src);
    827     src += src_pixels_per_line;
    828     d22u8 = vld1_u8(src);
    829     src += src_pixels_per_line;
    830     d23u8 = vld1_u8(src);
    831     src += src_pixels_per_line;
    832     d24u8 = vld1_u8(src);
    833     src += src_pixels_per_line;
    834     d25u8 = vld1_u8(src);
    835     src += src_pixels_per_line;
    836     d26u8 = vld1_u8(src);
    837     src += src_pixels_per_line;
    838     d27u8 = vld1_u8(src);
    839     src += src_pixels_per_line;
    840     d28u8 = vld1_u8(src);
    841     src += src_pixels_per_line;
    842     d29u8 = vld1_u8(src);
    843     src += src_pixels_per_line;
    844     d30u8 = vld1_u8(src);
    845 
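             // Filter 4 output rows per iteration; the row registers then
             // advance by four rows so the next iteration reuses rows that are
             // already loaded.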
    846     for (i = 2; i > 0; i--) {
    847       q3u16 = vmull_u8(d18u8, d0u8);
    848       q4u16 = vmull_u8(d19u8, d0u8);
    849       q5u16 = vmull_u8(d20u8, d0u8);
    850       q6u16 = vmull_u8(d21u8, d0u8);
    851 
    852       q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
    853       q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
    854       q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
    855       q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
    856 
    857       q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
    858       q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
    859       q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
    860       q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
    861 
    862       q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
    863       q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
    864       q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
    865       q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
    866 
    867       q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
    868       q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
    869       q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
    870       q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
    871 
    872       q7u16 = vmull_u8(d21u8, d3u8);
    873       q8u16 = vmull_u8(d22u8, d3u8);
    874       q9u16 = vmull_u8(d23u8, d3u8);
    875       q10u16 = vmull_u8(d24u8, d3u8);
    876 
    877       q3s16 = vreinterpretq_s16_u16(q3u16);
    878       q4s16 = vreinterpretq_s16_u16(q4u16);
    879       q5s16 = vreinterpretq_s16_u16(q5u16);
    880       q6s16 = vreinterpretq_s16_u16(q6u16);
    881       q7s16 = vreinterpretq_s16_u16(q7u16);
    882       q8s16 = vreinterpretq_s16_u16(q8u16);
    883       q9s16 = vreinterpretq_s16_u16(q9u16);
    884       q10s16 = vreinterpretq_s16_u16(q10u16);
    885 
    886       q7s16 = vqaddq_s16(q7s16, q3s16);
    887       q8s16 = vqaddq_s16(q8s16, q4s16);
    888       q9s16 = vqaddq_s16(q9s16, q5s16);
    889       q10s16 = vqaddq_s16(q10s16, q6s16);
    890 
    891       d6u8 = vqrshrun_n_s16(q7s16, 7);
    892       d7u8 = vqrshrun_n_s16(q8s16, 7);
    893       d8u8 = vqrshrun_n_s16(q9s16, 7);
    894       d9u8 = vqrshrun_n_s16(q10s16, 7);
    895 
    896       d18u8 = d22u8;
    897       d19u8 = d23u8;
    898       d20u8 = d24u8;
    899       d21u8 = d25u8;
    900       d22u8 = d26u8;
    901       d23u8 = d27u8;
    902       d24u8 = d28u8;
    903       d25u8 = d29u8;
    904       d26u8 = d30u8;
    905 
    906       vst1_u8(dst_ptr, d6u8);
    907       dst_ptr += dst_pitch;
    908       vst1_u8(dst_ptr, d7u8);
    909       dst_ptr += dst_pitch;
    910       vst1_u8(dst_ptr, d8u8);
    911       dst_ptr += dst_pitch;
    912       vst1_u8(dst_ptr, d9u8);
    913       dst_ptr += dst_pitch;
    914     }
    915     return;
    916   }
    917 
    918   // load first_pass filter
    919   dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    920   d0s8 = vdup_lane_s8(dtmps8, 0);
    921   d1s8 = vdup_lane_s8(dtmps8, 1);
    922   d2s8 = vdup_lane_s8(dtmps8, 2);
    923   d3s8 = vdup_lane_s8(dtmps8, 3);
    924   d4s8 = vdup_lane_s8(dtmps8, 4);
    925   d5s8 = vdup_lane_s8(dtmps8, 5);
    926   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    927   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    928   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    929   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    930   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    931   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    932 
     933   // First pass: output_height lines x output_width columns (13x8)
     934   if (yoffset == 0)  // firstpass_filter8x8_only
    935     src = src_ptr - 2;
    936   else
    937     src = src_ptr - 2 - (src_pixels_per_line * 2);
    938 
    939   tmpp = tmp;
    940   for (i = 2; i > 0; i--) {
    941     q3u8 = vld1q_u8(src);
    942     src += src_pixels_per_line;
    943     q4u8 = vld1q_u8(src);
    944     src += src_pixels_per_line;
    945     q5u8 = vld1q_u8(src);
    946     src += src_pixels_per_line;
    947     q6u8 = vld1q_u8(src);
    948     src += src_pixels_per_line;
    949 
    950     __builtin_prefetch(src);
    951     __builtin_prefetch(src + src_pixels_per_line);
    952     __builtin_prefetch(src + src_pixels_per_line * 2);
    953 
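             // Same horizontal 6-tap pattern as the 8x4 first pass: tap 0 on
             // bytes 0..7, vext_u8 offsets 1..5 for the other taps, taps 1 and 4
             // subtracted, and tap 3 accumulated separately for the saturating
             // combine.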
    954     q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    955     q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    956     q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    957     q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    958 
    959     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    960     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    961     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    962     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    963 
    964     q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
    965     q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
    966     q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
    967     q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
    968 
    969     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    970     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    971     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    972     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    973 
    974     q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
    975     q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
    976     q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
    977     q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
    978 
    979     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    980     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    981     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    982     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    983 
    984     q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
    985     q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
    986     q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
    987     q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
    988 
    989     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    990     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    991     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    992     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    993 
    994     q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
    995     q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
    996     q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
    997     q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
    998 
    999     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
   1000     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
   1001     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
   1002     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
   1003 
   1004     q3u16 = vmull_u8(d28u8, d3u8);
   1005     q4u16 = vmull_u8(d29u8, d3u8);
   1006     q5u16 = vmull_u8(d30u8, d3u8);
   1007     q6u16 = vmull_u8(d31u8, d3u8);
   1008 
   1009     q3s16 = vreinterpretq_s16_u16(q3u16);
   1010     q4s16 = vreinterpretq_s16_u16(q4u16);
   1011     q5s16 = vreinterpretq_s16_u16(q5u16);
   1012     q6s16 = vreinterpretq_s16_u16(q6u16);
   1013     q7s16 = vreinterpretq_s16_u16(q7u16);
   1014     q8s16 = vreinterpretq_s16_u16(q8u16);
   1015     q9s16 = vreinterpretq_s16_u16(q9u16);
   1016     q10s16 = vreinterpretq_s16_u16(q10u16);
   1017 
   1018     q7s16 = vqaddq_s16(q7s16, q3s16);
   1019     q8s16 = vqaddq_s16(q8s16, q4s16);
   1020     q9s16 = vqaddq_s16(q9s16, q5s16);
   1021     q10s16 = vqaddq_s16(q10s16, q6s16);
   1022 
   1023     d22u8 = vqrshrun_n_s16(q7s16, 7);
   1024     d23u8 = vqrshrun_n_s16(q8s16, 7);
   1025     d24u8 = vqrshrun_n_s16(q9s16, 7);
   1026     d25u8 = vqrshrun_n_s16(q10s16, 7);
   1027 
    1028     if (yoffset == 0) {  // firstpass_filter8x8_only
   1029       vst1_u8(dst_ptr, d22u8);
   1030       dst_ptr += dst_pitch;
   1031       vst1_u8(dst_ptr, d23u8);
   1032       dst_ptr += dst_pitch;
   1033       vst1_u8(dst_ptr, d24u8);
   1034       dst_ptr += dst_pitch;
   1035       vst1_u8(dst_ptr, d25u8);
   1036       dst_ptr += dst_pitch;
   1037     } else {
   1038       vst1_u8(tmpp, d22u8);
   1039       tmpp += 8;
   1040       vst1_u8(tmpp, d23u8);
   1041       tmpp += 8;
   1042       vst1_u8(tmpp, d24u8);
   1043       tmpp += 8;
   1044       vst1_u8(tmpp, d25u8);
   1045       tmpp += 8;
   1046     }
   1047   }
   1048   if (yoffset == 0) return;
   1049 
    1050   // First pass on the remaining 5 lines of data
   1051   q3u8 = vld1q_u8(src);
   1052   src += src_pixels_per_line;
   1053   q4u8 = vld1q_u8(src);
   1054   src += src_pixels_per_line;
   1055   q5u8 = vld1q_u8(src);
   1056   src += src_pixels_per_line;
   1057   q6u8 = vld1q_u8(src);
   1058   src += src_pixels_per_line;
   1059   q7u8 = vld1q_u8(src);
   1060 
   1061   q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
   1062   q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
   1063   q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
   1064   q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
   1065   q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
   1066 
   1067   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
   1068   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
   1069   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
   1070   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
   1071   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
   1072 
   1073   q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
   1074   q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
   1075   q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
   1076   q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
   1077   q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
   1078 
   1079   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
   1080   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
   1081   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
   1082   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
   1083   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
   1084 
   1085   q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
   1086   q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
   1087   q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
   1088   q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
   1089   q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
   1090 
   1091   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
   1092   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
   1093   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
   1094   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
   1095   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
   1096 
   1097   q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
   1098   q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
   1099   q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
   1100   q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
   1101   q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
   1102 
   1103   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
   1104   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
   1105   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
   1106   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
   1107   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
   1108 
   1109   q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
   1110   q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
   1111   q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
   1112   q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
   1113   q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
   1114 
   1115   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
   1116   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
   1117   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
   1118   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
   1119   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
   1120 
   1121   q3u16 = vmull_u8(d27u8, d3u8);
   1122   q4u16 = vmull_u8(d28u8, d3u8);
   1123   q5u16 = vmull_u8(d29u8, d3u8);
   1124   q6u16 = vmull_u8(d30u8, d3u8);
   1125   q7u16 = vmull_u8(d31u8, d3u8);
   1126 
   1127   q3s16 = vreinterpretq_s16_u16(q3u16);
   1128   q4s16 = vreinterpretq_s16_u16(q4u16);
   1129   q5s16 = vreinterpretq_s16_u16(q5u16);
   1130   q6s16 = vreinterpretq_s16_u16(q6u16);
   1131   q7s16 = vreinterpretq_s16_u16(q7u16);
   1132   q8s16 = vreinterpretq_s16_u16(q8u16);
   1133   q9s16 = vreinterpretq_s16_u16(q9u16);
   1134   q10s16 = vreinterpretq_s16_u16(q10u16);
   1135   q11s16 = vreinterpretq_s16_u16(q11u16);
   1136   q12s16 = vreinterpretq_s16_u16(q12u16);
   1137 
   1138   q8s16 = vqaddq_s16(q8s16, q3s16);
   1139   q9s16 = vqaddq_s16(q9s16, q4s16);
   1140   q10s16 = vqaddq_s16(q10s16, q5s16);
   1141   q11s16 = vqaddq_s16(q11s16, q6s16);
   1142   q12s16 = vqaddq_s16(q12s16, q7s16);
   1143 
   1144   d26u8 = vqrshrun_n_s16(q8s16, 7);
   1145   d27u8 = vqrshrun_n_s16(q9s16, 7);
   1146   d28u8 = vqrshrun_n_s16(q10s16, 7);
   1147   d29u8 = vqrshrun_n_s16(q11s16, 7);
   1148   d30u8 = vqrshrun_n_s16(q12s16, 7);
   1149 
   1150   // Second pass: 8x8
   1151   dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
   1152   d0s8 = vdup_lane_s8(dtmps8, 0);
   1153   d1s8 = vdup_lane_s8(dtmps8, 1);
   1154   d2s8 = vdup_lane_s8(dtmps8, 2);
   1155   d3s8 = vdup_lane_s8(dtmps8, 3);
   1156   d4s8 = vdup_lane_s8(dtmps8, 4);
   1157   d5s8 = vdup_lane_s8(dtmps8, 5);
   1158   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
   1159   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
   1160   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
   1161   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
   1162   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
   1163   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
   1164 
   1165   tmpp = tmp;
   1166   q9u8 = vld1q_u8(tmpp);
   1167   tmpp += 16;
   1168   q10u8 = vld1q_u8(tmpp);
   1169   tmpp += 16;
   1170   q11u8 = vld1q_u8(tmpp);
   1171   tmpp += 16;
   1172   q12u8 = vld1q_u8(tmpp);
   1173 
   1174   d18u8 = vget_low_u8(q9u8);
   1175   d19u8 = vget_high_u8(q9u8);
   1176   d20u8 = vget_low_u8(q10u8);
   1177   d21u8 = vget_high_u8(q10u8);
   1178   d22u8 = vget_low_u8(q11u8);
   1179   d23u8 = vget_high_u8(q11u8);
   1180   d24u8 = vget_low_u8(q12u8);
   1181   d25u8 = vget_high_u8(q12u8);
   1182 
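           // Vertical second pass: the first 8 intermediate rows were reloaded
           // from tmp[] above; the last 5 (rows 8..12) are still live in
           // d26u8..d30u8 from the first pass.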
   1183   for (i = 2; i > 0; i--) {
   1184     q3u16 = vmull_u8(d18u8, d0u8);
   1185     q4u16 = vmull_u8(d19u8, d0u8);
   1186     q5u16 = vmull_u8(d20u8, d0u8);
   1187     q6u16 = vmull_u8(d21u8, d0u8);
   1188 
   1189     q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
   1190     q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
   1191     q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
   1192     q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
   1193 
   1194     q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
   1195     q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
   1196     q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
   1197     q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
   1198 
   1199     q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
   1200     q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
   1201     q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
   1202     q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
   1203 
   1204     q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
   1205     q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
   1206     q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
   1207     q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
   1208 
   1209     q7u16 = vmull_u8(d21u8, d3u8);
   1210     q8u16 = vmull_u8(d22u8, d3u8);
   1211     q9u16 = vmull_u8(d23u8, d3u8);
   1212     q10u16 = vmull_u8(d24u8, d3u8);
   1213 
   1214     q3s16 = vreinterpretq_s16_u16(q3u16);
   1215     q4s16 = vreinterpretq_s16_u16(q4u16);
   1216     q5s16 = vreinterpretq_s16_u16(q5u16);
   1217     q6s16 = vreinterpretq_s16_u16(q6u16);
   1218     q7s16 = vreinterpretq_s16_u16(q7u16);
   1219     q8s16 = vreinterpretq_s16_u16(q8u16);
   1220     q9s16 = vreinterpretq_s16_u16(q9u16);
   1221     q10s16 = vreinterpretq_s16_u16(q10u16);
   1222 
   1223     q7s16 = vqaddq_s16(q7s16, q3s16);
   1224     q8s16 = vqaddq_s16(q8s16, q4s16);
   1225     q9s16 = vqaddq_s16(q9s16, q5s16);
   1226     q10s16 = vqaddq_s16(q10s16, q6s16);
   1227 
   1228     d6u8 = vqrshrun_n_s16(q7s16, 7);
   1229     d7u8 = vqrshrun_n_s16(q8s16, 7);
   1230     d8u8 = vqrshrun_n_s16(q9s16, 7);
   1231     d9u8 = vqrshrun_n_s16(q10s16, 7);
   1232 
   1233     d18u8 = d22u8;
   1234     d19u8 = d23u8;
   1235     d20u8 = d24u8;
   1236     d21u8 = d25u8;
   1237     d22u8 = d26u8;
   1238     d23u8 = d27u8;
   1239     d24u8 = d28u8;
   1240     d25u8 = d29u8;
   1241     d26u8 = d30u8;
   1242 
   1243     vst1_u8(dst_ptr, d6u8);
   1244     dst_ptr += dst_pitch;
   1245     vst1_u8(dst_ptr, d7u8);
   1246     dst_ptr += dst_pitch;
   1247     vst1_u8(dst_ptr, d8u8);
   1248     dst_ptr += dst_pitch;
   1249     vst1_u8(dst_ptr, d9u8);
   1250     dst_ptr += dst_pitch;
   1251   }
   1252   return;
   1253 }
   1254 
   1255 void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
   1256                                   int src_pixels_per_line, int xoffset,
   1257                                   int yoffset, unsigned char *dst_ptr,
   1258                                   int dst_pitch) {
   1259   unsigned char *src, *src_tmp, *dst, *tmpp;
   1260   unsigned char tmp[336];
   1261   int i, j;
   1262   uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
   1263   uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
   1264   uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
   1265   uint8x8_t d28u8, d29u8, d30u8, d31u8;
   1266   int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
   1267   uint8x16_t q3u8, q4u8;
   1268   uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
   1269   uint16x8_t q11u16, q12u16, q13u16, q15u16;
   1270   int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
   1271   int16x8_t q11s16, q12s16, q13s16, q15s16;
   1272 
    1273   if (xoffset == 0) {  // secondpass_filter16x16_only
   1274     // load second_pass filter
   1275     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
   1276     d0s8 = vdup_lane_s8(dtmps8, 0);
   1277     d1s8 = vdup_lane_s8(dtmps8, 1);
   1278     d2s8 = vdup_lane_s8(dtmps8, 2);
   1279     d3s8 = vdup_lane_s8(dtmps8, 3);
   1280     d4s8 = vdup_lane_s8(dtmps8, 4);
   1281     d5s8 = vdup_lane_s8(dtmps8, 5);
   1282     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
   1283     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
   1284     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
   1285     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
   1286     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
   1287     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
   1288 
   1289     // load src data
   1290     src_tmp = src_ptr - src_pixels_per_line * 2;
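             // Process the 16-wide block as two independent 8-wide halves.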
    for (i = 0; i < 2; ++i) {
      src = src_tmp + i * 8;
      dst = dst_ptr + i * 8;
      d18u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d19u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d20u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d21u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d22u8 = vld1_u8(src);
      src += src_pixels_per_line;
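      // Each pass of the inner loop loads four new rows, filters the nine-row
      // window down to four output rows, and slides the window down by four.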
      for (j = 0; j < 4; ++j) {
        d23u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d24u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d25u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d26u8 = vld1_u8(src);
        src += src_pixels_per_line;

        q3u16 = vmull_u8(d18u8, d0u8);
        q4u16 = vmull_u8(d19u8, d0u8);
        q5u16 = vmull_u8(d20u8, d0u8);
        q6u16 = vmull_u8(d21u8, d0u8);

        q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
        q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
        q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
        q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

        q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
        q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
        q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
        q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

        q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
        q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
        q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
        q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

        q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
        q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
        q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
        q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

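        // The centre tap (element 3) is accumulated separately and folded in
        // with a saturating signed add.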
        q7u16 = vmull_u8(d21u8, d3u8);
        q8u16 = vmull_u8(d22u8, d3u8);
        q9u16 = vmull_u8(d23u8, d3u8);
        q10u16 = vmull_u8(d24u8, d3u8);

        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);

        q7s16 = vqaddq_s16(q7s16, q3s16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q9s16 = vqaddq_s16(q9s16, q5s16);
        q10s16 = vqaddq_s16(q10s16, q6s16);

        d6u8 = vqrshrun_n_s16(q7s16, 7);
        d7u8 = vqrshrun_n_s16(q8s16, 7);
        d8u8 = vqrshrun_n_s16(q9s16, 7);
        d9u8 = vqrshrun_n_s16(q10s16, 7);

        d18u8 = d22u8;
        d19u8 = d23u8;
        d20u8 = d24u8;
        d21u8 = d25u8;
        d22u8 = d26u8;

        vst1_u8(dst, d6u8);
        dst += dst_pitch;
        vst1_u8(dst, d7u8);
        dst += dst_pitch;
        vst1_u8(dst, d8u8);
        dst += dst_pitch;
        vst1_u8(dst, d9u8);
        dst += dst_pitch;
      }
    }
    return;
  }

  // load first_pass filter
  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  // First pass: output_height lines x output_width columns (21x16)
  if (yoffset == 0) {  // firstpass_filter16x16_only
    src = src_ptr - 2;
    dst = dst_ptr;
    for (i = 0; i < 8; ++i) {
      d6u8 = vld1_u8(src);
      d7u8 = vld1_u8(src + 8);
      d8u8 = vld1_u8(src + 16);
      src += src_pixels_per_line;
      d9u8 = vld1_u8(src);
      d10u8 = vld1_u8(src + 8);
      d11u8 = vld1_u8(src + 16);
      src += src_pixels_per_line;

      __builtin_prefetch(src);
      __builtin_prefetch(src + src_pixels_per_line);

      q6u16 = vmull_u8(d6u8, d0u8);
      q7u16 = vmull_u8(d7u8, d0u8);
      q8u16 = vmull_u8(d9u8, d0u8);
      q9u16 = vmull_u8(d10u8, d0u8);

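      // vext_u8 supplies the shifted pixels for taps 1-5; the three 8-byte
      // loads per row cover the 21 source bytes needed for 16 outputs.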
      d20u8 = vext_u8(d6u8, d7u8, 1);
      d21u8 = vext_u8(d9u8, d10u8, 1);
      d22u8 = vext_u8(d7u8, d8u8, 1);
      d23u8 = vext_u8(d10u8, d11u8, 1);
      d24u8 = vext_u8(d6u8, d7u8, 4);
      d25u8 = vext_u8(d9u8, d10u8, 4);
      d26u8 = vext_u8(d7u8, d8u8, 4);
      d27u8 = vext_u8(d10u8, d11u8, 4);
      d28u8 = vext_u8(d6u8, d7u8, 5);
      d29u8 = vext_u8(d9u8, d10u8, 5);

      q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
      q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
      q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
      q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
      q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
      q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
      q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
      q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
      q8u16 = vmlal_u8(q8u16, d29u8, d5u8);

      d20u8 = vext_u8(d7u8, d8u8, 5);
      d21u8 = vext_u8(d10u8, d11u8, 5);
      d22u8 = vext_u8(d6u8, d7u8, 2);
      d23u8 = vext_u8(d9u8, d10u8, 2);
      d24u8 = vext_u8(d7u8, d8u8, 2);
      d25u8 = vext_u8(d10u8, d11u8, 2);
      d26u8 = vext_u8(d6u8, d7u8, 3);
      d27u8 = vext_u8(d9u8, d10u8, 3);
      d28u8 = vext_u8(d7u8, d8u8, 3);
      d29u8 = vext_u8(d10u8, d11u8, 3);

      q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
      q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
      q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
      q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
      q9u16 = vmlal_u8(q9u16, d25u8, d2u8);

      q10u16 = vmull_u8(d26u8, d3u8);
      q11u16 = vmull_u8(d27u8, d3u8);
      q12u16 = vmull_u8(d28u8, d3u8);
      q15u16 = vmull_u8(d29u8, d3u8);

      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);
      q11s16 = vreinterpretq_s16_u16(q11u16);
      q12s16 = vreinterpretq_s16_u16(q12u16);
      q15s16 = vreinterpretq_s16_u16(q15u16);

      q6s16 = vqaddq_s16(q6s16, q10s16);
      q8s16 = vqaddq_s16(q8s16, q11s16);
      q7s16 = vqaddq_s16(q7s16, q12s16);
      q9s16 = vqaddq_s16(q9s16, q15s16);

      d6u8 = vqrshrun_n_s16(q6s16, 7);
      d7u8 = vqrshrun_n_s16(q7s16, 7);
      d8u8 = vqrshrun_n_s16(q8s16, 7);
      d9u8 = vqrshrun_n_s16(q9s16, 7);

      q3u8 = vcombine_u8(d6u8, d7u8);
      q4u8 = vcombine_u8(d8u8, d9u8);
      vst1q_u8(dst, q3u8);
      dst += dst_pitch;
      vst1q_u8(dst, q4u8);
      dst += dst_pitch;
    }
    return;
  }

  src = src_ptr - 2 - src_pixels_per_line * 2;
  tmpp = tmp;
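  // First pass: horizontally filter 21 rows (16 output rows plus 5 extra for
  // the vertical filter) into the 16-byte-wide tmp buffer, 3 rows per
  // iteration.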
  for (i = 0; i < 7; ++i) {
    d6u8 = vld1_u8(src);
    d7u8 = vld1_u8(src + 8);
    d8u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    d9u8 = vld1_u8(src);
    d10u8 = vld1_u8(src + 8);
    d11u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    d12u8 = vld1_u8(src);
    d13u8 = vld1_u8(src + 8);
    d14u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;

    __builtin_prefetch(src);
    __builtin_prefetch(src + src_pixels_per_line);
    __builtin_prefetch(src + src_pixels_per_line * 2);

    q8u16 = vmull_u8(d6u8, d0u8);
    q9u16 = vmull_u8(d7u8, d0u8);
    q10u16 = vmull_u8(d9u8, d0u8);
    q11u16 = vmull_u8(d10u8, d0u8);
    q12u16 = vmull_u8(d12u8, d0u8);
    q13u16 = vmull_u8(d13u8, d0u8);

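    // Offsets 1-5 from vext_u8 line up the source with filter taps 1-5;
    // vmlsl_u8 subtracts the negative taps (1 and 4), vmlal_u8 adds the rest.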
    d28u8 = vext_u8(d6u8, d7u8, 1);
    d29u8 = vext_u8(d9u8, d10u8, 1);
    d30u8 = vext_u8(d12u8, d13u8, 1);
    q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
    d28u8 = vext_u8(d7u8, d8u8, 1);
    d29u8 = vext_u8(d10u8, d11u8, 1);
    d30u8 = vext_u8(d13u8, d14u8, 1);
    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
    q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);

    d28u8 = vext_u8(d6u8, d7u8, 4);
    d29u8 = vext_u8(d9u8, d10u8, 4);
    d30u8 = vext_u8(d12u8, d13u8, 4);
    q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
    d28u8 = vext_u8(d7u8, d8u8, 4);
    d29u8 = vext_u8(d10u8, d11u8, 4);
    d30u8 = vext_u8(d13u8, d14u8, 4);
    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
    q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);

    d28u8 = vext_u8(d6u8, d7u8, 5);
    d29u8 = vext_u8(d9u8, d10u8, 5);
    d30u8 = vext_u8(d12u8, d13u8, 5);
    q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
    d28u8 = vext_u8(d7u8, d8u8, 5);
    d29u8 = vext_u8(d10u8, d11u8, 5);
    d30u8 = vext_u8(d13u8, d14u8, 5);
    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
    q13u16 = vmlal_u8(q13u16, d30u8, d5u8);

    d28u8 = vext_u8(d6u8, d7u8, 2);
    d29u8 = vext_u8(d9u8, d10u8, 2);
    d30u8 = vext_u8(d12u8, d13u8, 2);
    q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
    d28u8 = vext_u8(d7u8, d8u8, 2);
    d29u8 = vext_u8(d10u8, d11u8, 2);
    d30u8 = vext_u8(d13u8, d14u8, 2);
    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
    q13u16 = vmlal_u8(q13u16, d30u8, d2u8);

    d28u8 = vext_u8(d6u8, d7u8, 3);
    d29u8 = vext_u8(d9u8, d10u8, 3);
    d30u8 = vext_u8(d12u8, d13u8, 3);
    d15u8 = vext_u8(d7u8, d8u8, 3);
    d31u8 = vext_u8(d10u8, d11u8, 3);
    d6u8 = vext_u8(d13u8, d14u8, 3);
    q4u16 = vmull_u8(d28u8, d3u8);
    q5u16 = vmull_u8(d29u8, d3u8);
    q6u16 = vmull_u8(d30u8, d3u8);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q10s16 = vqaddq_s16(q10s16, q5s16);
    q12s16 = vqaddq_s16(q12s16, q6s16);

    q6u16 = vmull_u8(d15u8, d3u8);
    q7u16 = vmull_u8(d31u8, d3u8);
    q3u16 = vmull_u8(d6u8, d3u8);
    q3s16 = vreinterpretq_s16_u16(q3u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q13s16 = vreinterpretq_s16_u16(q13u16);
    q9s16 = vqaddq_s16(q9s16, q6s16);
    q11s16 = vqaddq_s16(q11s16, q7s16);
    q13s16 = vqaddq_s16(q13s16, q3s16);

    d6u8 = vqrshrun_n_s16(q8s16, 7);
    d7u8 = vqrshrun_n_s16(q9s16, 7);
    d8u8 = vqrshrun_n_s16(q10s16, 7);
    d9u8 = vqrshrun_n_s16(q11s16, 7);
    d10u8 = vqrshrun_n_s16(q12s16, 7);
    d11u8 = vqrshrun_n_s16(q13s16, 7);

    vst1_u8(tmpp, d6u8);
    tmpp += 8;
    vst1_u8(tmpp, d7u8);
    tmpp += 8;
    vst1_u8(tmpp, d8u8);
    tmpp += 8;
    vst1_u8(tmpp, d9u8);
    tmpp += 8;
    vst1_u8(tmpp, d10u8);
    tmpp += 8;
    vst1_u8(tmpp, d11u8);
    tmpp += 8;
  }

  // Second pass: 16x16
  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

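  // Second pass: vertically filter each 8-wide half of tmp (stride 16) to
  // produce the 16x16 destination block.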
  for (i = 0; i < 2; ++i) {
    dst = dst_ptr + 8 * i;
    tmpp = tmp + 8 * i;
    d18u8 = vld1_u8(tmpp);
    tmpp += 16;
    d19u8 = vld1_u8(tmpp);
    tmpp += 16;
    d20u8 = vld1_u8(tmpp);
    tmpp += 16;
    d21u8 = vld1_u8(tmpp);
    tmpp += 16;
    d22u8 = vld1_u8(tmpp);
    tmpp += 16;
    for (j = 0; j < 4; ++j) {
      d23u8 = vld1_u8(tmpp);
      tmpp += 16;
      d24u8 = vld1_u8(tmpp);
      tmpp += 16;
      d25u8 = vld1_u8(tmpp);
      tmpp += 16;
      d26u8 = vld1_u8(tmpp);
      tmpp += 16;

      q3u16 = vmull_u8(d18u8, d0u8);
      q4u16 = vmull_u8(d19u8, d0u8);
      q5u16 = vmull_u8(d20u8, d0u8);
      q6u16 = vmull_u8(d21u8, d0u8);

      q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
      q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
      q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

      q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
      q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
      q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
      q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

      q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
      q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
      q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
      q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

      q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
      q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
      q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

      q7u16 = vmull_u8(d21u8, d3u8);
      q8u16 = vmull_u8(d22u8, d3u8);
      q9u16 = vmull_u8(d23u8, d3u8);
      q10u16 = vmull_u8(d24u8, d3u8);

      q3s16 = vreinterpretq_s16_u16(q3u16);
      q4s16 = vreinterpretq_s16_u16(q4u16);
      q5s16 = vreinterpretq_s16_u16(q5u16);
      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);

      q7s16 = vqaddq_s16(q7s16, q3s16);
      q8s16 = vqaddq_s16(q8s16, q4s16);
      q9s16 = vqaddq_s16(q9s16, q5s16);
      q10s16 = vqaddq_s16(q10s16, q6s16);

      d6u8 = vqrshrun_n_s16(q7s16, 7);
      d7u8 = vqrshrun_n_s16(q8s16, 7);
      d8u8 = vqrshrun_n_s16(q9s16, 7);
      d9u8 = vqrshrun_n_s16(q10s16, 7);

      d18u8 = d22u8;
      d19u8 = d23u8;
      d20u8 = d24u8;
      d21u8 = d25u8;
      d22u8 = d26u8;

      vst1_u8(dst, d6u8);
      dst += dst_pitch;
      vst1_u8(dst, d7u8);
      dst += dst_pitch;
      vst1_u8(dst, d8u8);
      dst += dst_pitch;
      vst1_u8(dst, d9u8);
      dst += dst_pitch;
    }
  }
  return;
}