/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/transpose_neon.h"

// Throughout this file, the static inline functions ending in '_8' process 8
// samples at a time, and those ending in '_16' process 16 samples at a time.
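// For example, the load_thresh_8() generated below fills 64-bit uint8x8_t
// vectors, while load_thresh_16() fills 128-bit uint8x16_t vectors.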

#define FUN_LOAD_THRESH(w, r)                                             \
  static INLINE void load_thresh_##w(                                     \
      const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \
      uint8x##w##_t *blimit_vec, uint8x##w##_t *limit_vec,                \
      uint8x##w##_t *thresh_vec) {                                        \
    *blimit_vec = vld1##r##dup_u8(blimit);                                \
    *limit_vec = vld1##r##dup_u8(limit);                                  \
    *thresh_vec = vld1##r##dup_u8(thresh);                                \
  }

FUN_LOAD_THRESH(8, _)    // load_thresh_8
FUN_LOAD_THRESH(16, q_)  // load_thresh_16
#undef FUN_LOAD_THRESH

static INLINE void load_thresh_8_dual(
    const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
    const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,
    uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) {
  *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1));
  *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1));
  *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1));
}

// Here flat is a 64-bit vector in which each 8-bit (or 4-bit) chunk is the
// mask of a pixel. When flat is used to control filter branches, we only need
// to know whether it is all 0s or all 1s. We pairwise-add flat's two 32-bit
// halves into a single 32-bit value, flat_status.
// flat equals 0 if and only if flat_status equals 0.
// flat equals -1 (all 1s) if and only if flat_status equals -2. (This holds
// because each mask occupies more than 1 bit.)
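// Worked example: if flat is all 1s, the pairwise add of its two 32-bit
// halves is 0xffffffff + 0xffffffff = 0x1fffffffe, whose low 32 bits are
// 0xfffffffe, i.e. (uint32_t)-2.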
static INLINE uint32_t calc_flat_status_8(uint8x8_t flat) {
  return vget_lane_u32(
      vreinterpret_u32_u64(vpaddl_u32(vreinterpret_u32_u8(flat))), 0);
}

// Here flat is a 128-bit vector in which each 8-bit chunk is the mask of a
// pixel. When flat is used to control filter branches, we only need to know
// whether it is all 0s or all 1s. We arithmetically shift each 16-bit chunk
// right by 4 with narrowing, yielding a 64-bit vector in which each 4-bit
// chunk is the mask of a pixel, then pairwise-add as above into a 32-bit
// value, flat_status.
// flat equals 0 if and only if flat_status equals 0.
// flat equals -1 (all 1s) if and only if flat_status equals -2. (This holds
// because each mask occupies more than 1 bit.)
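// Worked example: an all-1s 16-bit chunk is 0xffff, i.e. -1 as a signed
// value; an arithmetic shift right by 4 keeps it at -1, and narrowing to
// 8 bits yields 0xff, so an all-1s flat maps to an all-1s flat_4bit and the
// 64-bit case above applies.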
static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) {
  const uint8x8_t flat_4bit =
      vreinterpret_u8_s8(vshrn_n_s16(vreinterpretq_s16_u8(flat), 4));
  return calc_flat_status_8(flat_4bit);
}

#define FUN_FILTER_HEV_MASK4(w, r)                                            \
  static INLINE uint8x##w##_t filter_hev_mask4_##w(                           \
      const uint8x##w##_t limit, const uint8x##w##_t blimit,                  \
      const uint8x##w##_t thresh, const uint8x##w##_t p3,                     \
      const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
      const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
      const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) {      \
    uint8x##w##_t max, t0, t1;                                                \
                                                                              \
    max = vabd##r##u8(p1, p0);                                                \
    max = vmax##r##u8(max, vabd##r##u8(q1, q0));                              \
    *hev = vcgt##r##u8(max, thresh);                                          \
    *mask = vmax##r##u8(max, vabd##r##u8(p3, p2));                            \
    *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1));                          \
    *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1));                          \
    *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2));                          \
    t0 = vabd##r##u8(p0, q0);                                                 \
    t1 = vabd##r##u8(p1, q1);                                                 \
    t0 = vqadd##r##u8(t0, t0);                                                \
    t1 = vshr##r##n_u8(t1, 1);                                                \
    t0 = vqadd##r##u8(t0, t1);                                                \
    *mask = vcle##r##u8(*mask, limit);                                        \
    t0 = vcle##r##u8(t0, blimit);                                             \
    *mask = vand##r##u8(*mask, t0);                                           \
                                                                              \
    return max;                                                               \
  }

FUN_FILTER_HEV_MASK4(8, _)    // filter_hev_mask4_8
FUN_FILTER_HEV_MASK4(16, q_)  // filter_hev_mask4_16
#undef FUN_FILTER_HEV_MASK4

#define FUN_FILTER_FLAT_HEV_MASK(w, r)                                        \
  static INLINE uint8x##w##_t filter_flat_hev_mask_##w(                       \
      const uint8x##w##_t limit, const uint8x##w##_t blimit,                  \
      const uint8x##w##_t thresh, const uint8x##w##_t p3,                     \
      const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
      const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
      const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status,     \
      uint8x##w##_t *hev) {                                                   \
    uint8x##w##_t max, mask;                                                  \
                                                                              \
    max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \
                               q2, q3, hev, &mask);                           \
    *flat = vmax##r##u8(max, vabd##r##u8(p2, p0));                            \
    *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0));                          \
    *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0));                          \
    *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0));                          \
    *flat = vcle##r##u8(*flat, vdup##r##n_u8(1)); /* flat_mask4() */          \
    *flat = vand##r##u8(*flat, mask);                                         \
    *flat_status = calc_flat_status_##w(*flat);                               \
                                                                              \
    return mask;                                                              \
  }

FUN_FILTER_FLAT_HEV_MASK(8, _)    // filter_flat_hev_mask_8
FUN_FILTER_FLAT_HEV_MASK(16, q_)  // filter_flat_hev_mask_16
#undef FUN_FILTER_FLAT_HEV_MASK

#define FUN_FLAT_MASK5(w, r)                                                  \
  static INLINE uint8x##w##_t flat_mask5_##w(                                 \
      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
      const uint8x##w##_t q4, const uint8x##w##_t flat,                       \
      uint32_t *flat2_status) {                                               \
    uint8x##w##_t flat2 = vabd##r##u8(p4, p0);                                \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(p3, p0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(p2, p0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(p1, p0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(q1, q0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(q2, q0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(q3, q0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(q4, q0));                          \
    flat2 = vcle##r##u8(flat2, vdup##r##n_u8(1));                             \
    flat2 = vand##r##u8(flat2, flat);                                         \
    *flat2_status = calc_flat_status_##w(flat2);                              \
                                                                              \
    return flat2;                                                             \
  }

FUN_FLAT_MASK5(8, _)    // flat_mask5_8
FUN_FLAT_MASK5(16, q_)  // flat_mask5_16
#undef FUN_FLAT_MASK5

#define FUN_FLIP_SIGN(w, r)                                         \
  static INLINE int8x##w##_t flip_sign_##w(const uint8x##w##_t v) { \
    const uint8x##w##_t sign_bit = vdup##r##n_u8(0x80);             \
    return vreinterpret##r##s8_u8(veor##r##u8(v, sign_bit));        \
  }

FUN_FLIP_SIGN(8, _)    // flip_sign_8
FUN_FLIP_SIGN(16, q_)  // flip_sign_16
#undef FUN_FLIP_SIGN

#define FUN_FLIP_SIGN_BACK(w, r)                                         \
  static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \
    const int8x##w##_t sign_bit = vdup##r##n_s8(0x80);                   \
    return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit));             \
  }

FUN_FLIP_SIGN_BACK(8, _)    // flip_sign_back_8
FUN_FLIP_SIGN_BACK(16, q_)  // flip_sign_back_16
#undef FUN_FLIP_SIGN_BACK

static INLINE void filter_update_8(const uint8x8_t sub0, const uint8x8_t sub1,
                                   const uint8x8_t add0, const uint8x8_t add1,
                                   uint16x8_t *sum) {
  *sum = vsubw_u8(*sum, sub0);
  *sum = vsubw_u8(*sum, sub1);
  *sum = vaddw_u8(*sum, add0);
  *sum = vaddw_u8(*sum, add1);
}

static INLINE void filter_update_16(const uint8x16_t sub0,
                                    const uint8x16_t sub1,
                                    const uint8x16_t add0,
                                    const uint8x16_t add1, uint16x8_t *sum0,
                                    uint16x8_t *sum1) {
  *sum0 = vsubw_u8(*sum0, vget_low_u8(sub0));
  *sum1 = vsubw_u8(*sum1, vget_high_u8(sub0));
  *sum0 = vsubw_u8(*sum0, vget_low_u8(sub1));
  *sum1 = vsubw_u8(*sum1, vget_high_u8(sub1));
  *sum0 = vaddw_u8(*sum0, vget_low_u8(add0));
  *sum1 = vaddw_u8(*sum1, vget_high_u8(add0));
  *sum0 = vaddw_u8(*sum0, vget_low_u8(add1));
  *sum1 = vaddw_u8(*sum1, vget_high_u8(add1));
}

static INLINE uint8x8_t calc_7_tap_filter_8_kernel(const uint8x8_t sub0,
                                                   const uint8x8_t sub1,
                                                   const uint8x8_t add0,
                                                   const uint8x8_t add1,
                                                   uint16x8_t *sum) {
  filter_update_8(sub0, sub1, add0, add1, sum);
  return vrshrn_n_u16(*sum, 3);
}

static INLINE uint8x16_t calc_7_tap_filter_16_kernel(
    const uint8x16_t sub0, const uint8x16_t sub1, const uint8x16_t add0,
    const uint8x16_t add1, uint16x8_t *sum0, uint16x8_t *sum1) {
  filter_update_16(sub0, sub1, add0, add1, sum0, sum1);
  return vcombine_u8(vrshrn_n_u16(*sum0, 3), vrshrn_n_u16(*sum1, 3));
}

static INLINE uint8x8_t apply_15_tap_filter_8_kernel(
    const uint8x8_t flat, const uint8x8_t sub0, const uint8x8_t sub1,
    const uint8x8_t add0, const uint8x8_t add1, const uint8x8_t in,
    uint16x8_t *sum) {
  filter_update_8(sub0, sub1, add0, add1, sum);
  return vbsl_u8(flat, vrshrn_n_u16(*sum, 4), in);
}

static INLINE uint8x16_t apply_15_tap_filter_16_kernel(
    const uint8x16_t flat, const uint8x16_t sub0, const uint8x16_t sub1,
    const uint8x16_t add0, const uint8x16_t add1, const uint8x16_t in,
    uint16x8_t *sum0, uint16x8_t *sum1) {
  uint8x16_t t;
  filter_update_16(sub0, sub1, add0, add1, sum0, sum1);
  t = vcombine_u8(vrshrn_n_u16(*sum0, 4), vrshrn_n_u16(*sum1, 4));
  return vbslq_u8(flat, t, in);
}

// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
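// Only the first output below is computed from scratch; each later output
// updates the running sum by subtracting two old taps and adding two new
// ones (see filter_update_8() above). For example, op1 reuses op2's sum:
//   op1 = (sum - p3 - p2 + p1 + q1 + 4) >> 3,
// where sum = 3*p3 + 2*p2 + p1 + p0 + q0.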
static INLINE void calc_7_tap_filter_8(const uint8x8_t p3, const uint8x8_t p2,
                                       const uint8x8_t p1, const uint8x8_t p0,
                                       const uint8x8_t q0, const uint8x8_t q1,
                                       const uint8x8_t q2, const uint8x8_t q3,
                                       uint8x8_t *op2, uint8x8_t *op1,
                                       uint8x8_t *op0, uint8x8_t *oq0,
                                       uint8x8_t *oq1, uint8x8_t *oq2) {
  uint16x8_t sum;
  sum = vaddl_u8(p3, p3);   // 2*p3
  sum = vaddw_u8(sum, p3);  // 3*p3
  sum = vaddw_u8(sum, p2);  // 3*p3+p2
  sum = vaddw_u8(sum, p2);  // 3*p3+2*p2
  sum = vaddw_u8(sum, p1);  // 3*p3+2*p2+p1
  sum = vaddw_u8(sum, p0);  // 3*p3+2*p2+p1+p0
  sum = vaddw_u8(sum, q0);  // 3*p3+2*p2+p1+p0+q0
  *op2 = vrshrn_n_u16(sum, 3);
  *op1 = calc_7_tap_filter_8_kernel(p3, p2, p1, q1, &sum);
  *op0 = calc_7_tap_filter_8_kernel(p3, p1, p0, q2, &sum);
  *oq0 = calc_7_tap_filter_8_kernel(p3, p0, q0, q3, &sum);
  *oq1 = calc_7_tap_filter_8_kernel(p2, q0, q1, q3, &sum);
  *oq2 = calc_7_tap_filter_8_kernel(p1, q1, q2, q3, &sum);
}

static INLINE void calc_7_tap_filter_16(
    const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t p1,
    const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1,
    const uint8x16_t q2, const uint8x16_t q3, uint8x16_t *op2, uint8x16_t *op1,
    uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2) {
  uint16x8_t sum0, sum1;
  sum0 = vaddl_u8(vget_low_u8(p3), vget_low_u8(p3));    // 2*p3
  sum1 = vaddl_u8(vget_high_u8(p3), vget_high_u8(p3));  // 2*p3
  sum0 = vaddw_u8(sum0, vget_low_u8(p3));               // 3*p3
  sum1 = vaddw_u8(sum1, vget_high_u8(p3));              // 3*p3
  sum0 = vaddw_u8(sum0, vget_low_u8(p2));               // 3*p3+p2
  sum1 = vaddw_u8(sum1, vget_high_u8(p2));              // 3*p3+p2
  sum0 = vaddw_u8(sum0, vget_low_u8(p2));               // 3*p3+2*p2
  sum1 = vaddw_u8(sum1, vget_high_u8(p2));              // 3*p3+2*p2
  sum0 = vaddw_u8(sum0, vget_low_u8(p1));               // 3*p3+2*p2+p1
  sum1 = vaddw_u8(sum1, vget_high_u8(p1));              // 3*p3+2*p2+p1
  sum0 = vaddw_u8(sum0, vget_low_u8(p0));               // 3*p3+2*p2+p1+p0
  sum1 = vaddw_u8(sum1, vget_high_u8(p0));              // 3*p3+2*p2+p1+p0
  sum0 = vaddw_u8(sum0, vget_low_u8(q0));               // 3*p3+2*p2+p1+p0+q0
  sum1 = vaddw_u8(sum1, vget_high_u8(q0));              // 3*p3+2*p2+p1+p0+q0
  *op2 = vcombine_u8(vrshrn_n_u16(sum0, 3), vrshrn_n_u16(sum1, 3));
  *op1 = calc_7_tap_filter_16_kernel(p3, p2, p1, q1, &sum0, &sum1);
  *op0 = calc_7_tap_filter_16_kernel(p3, p1, p0, q2, &sum0, &sum1);
  *oq0 = calc_7_tap_filter_16_kernel(p3, p0, q0, q3, &sum0, &sum1);
  *oq1 = calc_7_tap_filter_16_kernel(p2, q0, q1, q3, &sum0, &sum1);
  *oq2 = calc_7_tap_filter_16_kernel(p1, q1, q2, q3, &sum0, &sum1);
}

#define FUN_APPLY_7_TAP_FILTER(w, r)                                          \
  static INLINE void apply_7_tap_filter_##w(                                  \
      const uint8x##w##_t flat, const uint8x##w##_t p3,                       \
      const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
      const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
      const uint8x##w##_t q3, uint8x##w##_t *op2, uint8x##w##_t *op1,         \
      uint8x##w##_t *op0, uint8x##w##_t *oq0, uint8x##w##_t *oq1,             \
      uint8x##w##_t *oq2) {                                                   \
    uint8x##w##_t tp1, tp0, tq0, tq1;                                         \
    calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0,    \
                          &tq0, &tq1, oq2);                                   \
    *op2 = vbsl##r##u8(flat, *op2, p2);                                       \
    *op1 = vbsl##r##u8(flat, tp1, *op1);                                      \
    *op0 = vbsl##r##u8(flat, tp0, *op0);                                      \
    *oq0 = vbsl##r##u8(flat, tq0, *oq0);                                      \
    *oq1 = vbsl##r##u8(flat, tq1, *oq1);                                      \
    *oq2 = vbsl##r##u8(flat, *oq2, q2);                                       \
  }

FUN_APPLY_7_TAP_FILTER(8, _)    // apply_7_tap_filter_8
FUN_APPLY_7_TAP_FILTER(16, q_)  // apply_7_tap_filter_16
#undef FUN_APPLY_7_TAP_FILTER

// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
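// As in the 7-tap case, only the first output is computed from scratch; the
// rest update the running sum via filter_update_8()/filter_update_16(), with
// vbsl selecting between the filtered value and the unfiltered input under
// the flat2 mask.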
static INLINE void apply_15_tap_filter_8(
    const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6,
    const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3,
    const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0,
    const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2,
    const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5,
    const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5,
    uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1,
    uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2,
    uint8x8_t *oq3, uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) {
  uint16x8_t sum;
  sum = vshll_n_u8(p7, 3);  // 8*p7
  sum = vsubw_u8(sum, p7);  // 7*p7
  sum = vaddw_u8(sum, p6);  // 7*p7+p6
  sum = vaddw_u8(sum, p6);  // 7*p7+2*p6
  sum = vaddw_u8(sum, p5);  // 7*p7+2*p6+p5
  sum = vaddw_u8(sum, p4);  // 7*p7+2*p6+p5+p4
  sum = vaddw_u8(sum, p3);  // 7*p7+2*p6+p5+p4+p3
  sum = vaddw_u8(sum, p2);  // 7*p7+2*p6+p5+p4+p3+p2
  sum = vaddw_u8(sum, p1);  // 7*p7+2*p6+p5+p4+p3+p2+p1
  sum = vaddw_u8(sum, p0);  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
  sum = vaddw_u8(sum, q0);  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
  *op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6);
  *op5 = apply_15_tap_filter_8_kernel(flat2, p7, p6, p5, q1, p5, &sum);
  *op4 = apply_15_tap_filter_8_kernel(flat2, p7, p5, p4, q2, p4, &sum);
  *op3 = apply_15_tap_filter_8_kernel(flat2, p7, p4, p3, q3, p3, &sum);
  *op2 = apply_15_tap_filter_8_kernel(flat2, p7, p3, p2, q4, *op2, &sum);
  *op1 = apply_15_tap_filter_8_kernel(flat2, p7, p2, p1, q5, *op1, &sum);
  *op0 = apply_15_tap_filter_8_kernel(flat2, p7, p1, p0, q6, *op0, &sum);
  *oq0 = apply_15_tap_filter_8_kernel(flat2, p7, p0, q0, q7, *oq0, &sum);
  *oq1 = apply_15_tap_filter_8_kernel(flat2, p6, q0, q1, q7, *oq1, &sum);
  *oq2 = apply_15_tap_filter_8_kernel(flat2, p5, q1, q2, q7, *oq2, &sum);
  *oq3 = apply_15_tap_filter_8_kernel(flat2, p4, q2, q3, q7, q3, &sum);
  *oq4 = apply_15_tap_filter_8_kernel(flat2, p3, q3, q4, q7, q4, &sum);
  *oq5 = apply_15_tap_filter_8_kernel(flat2, p2, q4, q5, q7, q5, &sum);
  *oq6 = apply_15_tap_filter_8_kernel(flat2, p1, q5, q6, q7, q6, &sum);
}

static INLINE void apply_15_tap_filter_16(
    const uint8x16_t flat2, const uint8x16_t p7, const uint8x16_t p6,
    const uint8x16_t p5, const uint8x16_t p4, const uint8x16_t p3,
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t q3, const uint8x16_t q4, const uint8x16_t q5,
    const uint8x16_t q6, const uint8x16_t q7, uint8x16_t *op6, uint8x16_t *op5,
    uint8x16_t *op4, uint8x16_t *op3, uint8x16_t *op2, uint8x16_t *op1,
    uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2,
    uint8x16_t *oq3, uint8x16_t *oq4, uint8x16_t *oq5, uint8x16_t *oq6) {
  uint16x8_t sum0, sum1;
  uint8x16_t t;
  sum0 = vshll_n_u8(vget_low_u8(p7), 3);    // 8*p7
  sum1 = vshll_n_u8(vget_high_u8(p7), 3);   // 8*p7
  sum0 = vsubw_u8(sum0, vget_low_u8(p7));   // 7*p7
  sum1 = vsubw_u8(sum1, vget_high_u8(p7));  // 7*p7
  sum0 = vaddw_u8(sum0, vget_low_u8(p6));   // 7*p7+p6
  sum1 = vaddw_u8(sum1, vget_high_u8(p6));  // 7*p7+p6
  sum0 = vaddw_u8(sum0, vget_low_u8(p6));   // 7*p7+2*p6
  sum1 = vaddw_u8(sum1, vget_high_u8(p6));  // 7*p7+2*p6
  sum0 = vaddw_u8(sum0, vget_low_u8(p5));   // 7*p7+2*p6+p5
  sum1 = vaddw_u8(sum1, vget_high_u8(p5));  // 7*p7+2*p6+p5
  sum0 = vaddw_u8(sum0, vget_low_u8(p4));   // 7*p7+2*p6+p5+p4
  sum1 = vaddw_u8(sum1, vget_high_u8(p4));  // 7*p7+2*p6+p5+p4
  sum0 = vaddw_u8(sum0, vget_low_u8(p3));   // 7*p7+2*p6+p5+p4+p3
  sum1 = vaddw_u8(sum1, vget_high_u8(p3));  // 7*p7+2*p6+p5+p4+p3
  sum0 = vaddw_u8(sum0, vget_low_u8(p2));   // 7*p7+2*p6+p5+p4+p3+p2
  sum1 = vaddw_u8(sum1, vget_high_u8(p2));  // 7*p7+2*p6+p5+p4+p3+p2
  sum0 = vaddw_u8(sum0, vget_low_u8(p1));   // 7*p7+2*p6+p5+p4+p3+p2+p1
  sum1 = vaddw_u8(sum1, vget_high_u8(p1));  // 7*p7+2*p6+p5+p4+p3+p2+p1
  sum0 = vaddw_u8(sum0, vget_low_u8(p0));   // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
  sum1 = vaddw_u8(sum1, vget_high_u8(p0));  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
  sum0 = vaddw_u8(sum0, vget_low_u8(q0));   // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
  sum1 = vaddw_u8(sum1, vget_high_u8(q0));  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
  t = vcombine_u8(vrshrn_n_u16(sum0, 4), vrshrn_n_u16(sum1, 4));
  *op6 = vbslq_u8(flat2, t, p6);
  *op5 = apply_15_tap_filter_16_kernel(flat2, p7, p6, p5, q1, p5, &sum0, &sum1);
  *op4 = apply_15_tap_filter_16_kernel(flat2, p7, p5, p4, q2, p4, &sum0, &sum1);
  *op3 = apply_15_tap_filter_16_kernel(flat2, p7, p4, p3, q3, p3, &sum0, &sum1);
  *op2 =
      apply_15_tap_filter_16_kernel(flat2, p7, p3, p2, q4, *op2, &sum0, &sum1);
  *op1 =
      apply_15_tap_filter_16_kernel(flat2, p7, p2, p1, q5, *op1, &sum0, &sum1);
  *op0 =
      apply_15_tap_filter_16_kernel(flat2, p7, p1, p0, q6, *op0, &sum0, &sum1);
  *oq0 =
      apply_15_tap_filter_16_kernel(flat2, p7, p0, q0, q7, *oq0, &sum0, &sum1);
  *oq1 =
      apply_15_tap_filter_16_kernel(flat2, p6, q0, q1, q7, *oq1, &sum0, &sum1);
  *oq2 =
      apply_15_tap_filter_16_kernel(flat2, p5, q1, q2, q7, *oq2, &sum0, &sum1);
  *oq3 = apply_15_tap_filter_16_kernel(flat2, p4, q2, q3, q7, q3, &sum0, &sum1);
  *oq4 = apply_15_tap_filter_16_kernel(flat2, p3, q3, q4, q7, q4, &sum0, &sum1);
  *oq5 = apply_15_tap_filter_16_kernel(flat2, p2, q4, q5, q7, q5, &sum0, &sum1);
  *oq6 = apply_15_tap_filter_16_kernel(flat2, p1, q5, q6, q7, q6, &sum0, &sum1);
}


#define FUN_FILTER4(w, r)                                                     \
  static INLINE void filter4_##w(                                             \
      const uint8x##w##_t mask, const uint8x##w##_t hev,                      \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
      const uint8x##w##_t q1, uint8x##w##_t *op1, uint8x##w##_t *op0,         \
      uint8x##w##_t *oq0, uint8x##w##_t *oq1) {                               \
    int8x##w##_t filter, filter1, filter2, t;                                 \
    int8x##w##_t ps1 = flip_sign_##w(p1);                                     \
    int8x##w##_t ps0 = flip_sign_##w(p0);                                     \
    int8x##w##_t qs0 = flip_sign_##w(q0);                                     \
    int8x##w##_t qs1 = flip_sign_##w(q1);                                     \
                                                                              \
    /* add outer taps if we have high edge variance */                        \
    filter = vqsub##r##s8(ps1, qs1);                                          \
    filter = vand##r##s8(filter, vreinterpret##r##s8_u8(hev));                \
    t = vqsub##r##s8(qs0, ps0);                                               \
                                                                              \
    /* inner taps */                                                          \
    filter = vqadd##r##s8(filter, t);                                         \
    filter = vqadd##r##s8(filter, t);                                         \
    filter = vqadd##r##s8(filter, t);                                         \
    filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask));               \
                                                                              \
    /* Add 4 to one side and 3 to the other before shifting right by 3, so */ \
    /* the two sides round apart: q0 is adjusted by (filter + 4) >> 3 and  */ \
    /* p0 by (filter + 3) >> 3. */                                            \
    filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3);       \
    filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3);       \
                                                                              \
    qs0 = vqsub##r##s8(qs0, filter1);                                         \
    ps0 = vqadd##r##s8(ps0, filter2);                                         \
    *oq0 = flip_sign_back_##w(qs0);                                           \
    *op0 = flip_sign_back_##w(ps0);                                           \
                                                                              \
    /* outer tap adjustments */                                               \
    filter = vrshr##r##n_s8(filter1, 1);                                      \
    filter = vbic##r##s8(filter, vreinterpret##r##s8_u8(hev));                \
                                                                              \
    qs1 = vqsub##r##s8(qs1, filter);                                          \
    ps1 = vqadd##r##s8(ps1, filter);                                          \
    *oq1 = flip_sign_back_##w(qs1);                                           \
    *op1 = flip_sign_back_##w(ps1);                                           \
  }

FUN_FILTER4(8, _)    // filter4_8
FUN_FILTER4(16, q_)  // filter4_16
#undef FUN_FILTER4

// When no pixel is flat (flat_status == 0), only filter4 runs. When every
// pixel is flat (flat_status == (uint32_t)-2), the filter4 output would be
// overwritten entirely, so only the 7-tap filter runs. Otherwise run filter4
// and blend in the 7-tap result wherever flat is set.
#define FUN_FILTER8(w)                                                         \
  static INLINE void filter8_##w(                                              \
      const uint8x##w##_t mask, const uint8x##w##_t flat,                      \
      const uint32_t flat_status, const uint8x##w##_t hev,                     \
      const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1,  \
      const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1,  \
      const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2,      \
      uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,              \
      uint8x##w##_t *oq1, uint8x##w##_t *oq2) {                                \
    if (flat_status != (uint32_t)-2) {                                         \
      filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1);              \
      *op2 = p2;                                                               \
      *oq2 = q2;                                                               \
      if (flat_status) {                                                       \
        apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
                               op0, oq0, oq1, oq2);                            \
      }                                                                        \
    } else {                                                                   \
      calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,     \
                            oq0, oq1, oq2);                                    \
    }                                                                          \
  }

FUN_FILTER8(8)   // filter8_8
FUN_FILTER8(16)  // filter8_16
#undef FUN_FILTER8

// Same branching idea as filter8_*, one level deeper: the 7-tap result is
// skipped when the 15-tap filter would overwrite it entirely
// (flat2_status == (uint32_t)-2).
#define FUN_FILTER16(w)                                                        \
  static INLINE void filter16_##w(                                             \
      const uint8x##w##_t mask, const uint8x##w##_t flat,                      \
      const uint32_t flat_status, const uint8x##w##_t flat2,                   \
      const uint32_t flat2_status, const uint8x##w##_t hev,                    \
      const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5,  \
      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
      const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
      const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5,          \
      uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2,              \
      uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,              \
      uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3,              \
      uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6) {            \
    if (flat_status != (uint32_t)-2) {                                         \
      filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1);              \
    }                                                                          \
                                                                               \
    if (flat_status) {                                                         \
      *op2 = p2;                                                               \
      *oq2 = q2;                                                               \
      if (flat2_status != (uint32_t)-2) {                                      \
        apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
                               op0, oq0, oq1, oq2);                            \
      }                                                                        \
      if (flat2_status) {                                                      \
        apply_15_tap_filter_##w(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, \
                                q2, q3, q4, q5, q6, q7, op6, op5, op4, op3,    \
                                op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5,   \
                                oq6);                                          \
      }                                                                        \
    }                                                                          \
  }

FUN_FILTER16(8)   // filter16_8
FUN_FILTER16(16)  // filter16_16
#undef FUN_FILTER16

#define FUN_LOAD8(w, r)                                                    \
  static INLINE void load_##w##x8(                                         \
      const uint8_t *s, const int p, uint8x##w##_t *p3, uint8x##w##_t *p2, \
      uint8x##w##_t *p1, uint8x##w##_t *p0, uint8x##w##_t *q0,             \
      uint8x##w##_t *q1, uint8x##w##_t *q2, uint8x##w##_t *q3) {           \
    *p3 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *p2 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *p1 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *p0 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *q0 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *q1 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *q2 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *q3 = vld1##r##u8(s);                                                  \
  }

FUN_LOAD8(8, _)    // load_8x8
FUN_LOAD8(16, q_)  // load_16x8
#undef FUN_LOAD8

#define FUN_LOAD16(w, r)                                                   \
  static INLINE void load_##w##x16(                                        \
      const uint8_t *s, const int p, uint8x##w##_t *s0, uint8x##w##_t *s1, \
      uint8x##w##_t *s2, uint8x##w##_t *s3, uint8x##w##_t *s4,             \
      uint8x##w##_t *s5, uint8x##w##_t *s6, uint8x##w##_t *s7,             \
      uint8x##w##_t *s8, uint8x##w##_t *s9, uint8x##w##_t *s10,            \
      uint8x##w##_t *s11, uint8x##w##_t *s12, uint8x##w##_t *s13,          \
      uint8x##w##_t *s14, uint8x##w##_t *s15) {                            \
    *s0 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s1 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s2 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s3 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s4 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s5 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s6 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s7 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s8 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s9 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s10 = vld1##r##u8(s);                                                 \
    s += p;                                                                \
    *s11 = vld1##r##u8(s);                                                 \
    s += p;                                                                \
    *s12 = vld1##r##u8(s);                                                 \
    s += p;                                                                \
    *s13 = vld1##r##u8(s);                                                 \
    s += p;                                                                \
    *s14 = vld1##r##u8(s);                                                 \
    s += p;                                                                \
    *s15 = vld1##r##u8(s);                                                 \
  }

FUN_LOAD16(8, _)    // load_8x16
FUN_LOAD16(16, q_)  // load_16x16
#undef FUN_LOAD16

#define FUN_STORE4(w, r)                                                       \
  static INLINE void store_##w##x4(                                            \
      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
      const uint8x##w##_t s2, const uint8x##w##_t s3) {                        \
    vst1##r##u8(s, s0);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s1);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s2);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s3);                                                        \
  }

FUN_STORE4(8, _)    // store_8x4
FUN_STORE4(16, q_)  // store_16x4
#undef FUN_STORE4

#define FUN_STORE6(w, r)                                                       \
  static INLINE void store_##w##x6(                                            \
      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
      const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4,  \
      const uint8x##w##_t s5) {                                                \
    vst1##r##u8(s, s0);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s1);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s2);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s3);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s4);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s5);                                                        \
  }

FUN_STORE6(8, _)    // store_8x6
FUN_STORE6(16, q_)  // store_16x6
#undef FUN_STORE6

// Transposed store of 4 columns (p1, p0, q0, q1) across a vertical edge:
// 4 interleaved bytes per row, for 8 rows.
static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
                             const uint8x8_t p0, const uint8x8_t q0,
                             const uint8x8_t q1) {
  uint8x8x4_t o;

  o.val[0] = p1;
  o.val[1] = p0;
  o.val[2] = q0;
  o.val[3] = q1;
  vst4_lane_u8(s, o, 0);
  s += p;
  vst4_lane_u8(s, o, 1);
  s += p;
  vst4_lane_u8(s, o, 2);
  s += p;
  vst4_lane_u8(s, o, 3);
  s += p;
  vst4_lane_u8(s, o, 4);
  s += p;
  vst4_lane_u8(s, o, 5);
  s += p;
  vst4_lane_u8(s, o, 6);
  s += p;
  vst4_lane_u8(s, o, 7);
}

// Transposed store of 6 columns across a vertical edge: 3 interleaved bytes
// on each side of the edge (s0..s2 at s - 3, s3..s5 at s), for 8 rows.
static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
                             const uint8x8_t s1, const uint8x8_t s2,
                             const uint8x8_t s3, const uint8x8_t s4,
                             const uint8x8_t s5) {
  uint8x8x3_t o0, o1;

  o0.val[0] = s0;
  o0.val[1] = s1;
  o0.val[2] = s2;
  o1.val[0] = s3;
  o1.val[1] = s4;
  o1.val[2] = s5;
  vst3_lane_u8(s - 3, o0, 0);
  vst3_lane_u8(s + 0, o1, 0);
  s += p;
  vst3_lane_u8(s - 3, o0, 1);
  vst3_lane_u8(s + 0, o1, 1);
  s += p;
  vst3_lane_u8(s - 3, o0, 2);
  vst3_lane_u8(s + 0, o1, 2);
  s += p;
  vst3_lane_u8(s - 3, o0, 3);
  vst3_lane_u8(s + 0, o1, 3);
  s += p;
  vst3_lane_u8(s - 3, o0, 4);
  vst3_lane_u8(s + 0, o1, 4);
  s += p;
  vst3_lane_u8(s - 3, o0, 5);
  vst3_lane_u8(s + 0, o1, 5);
  s += p;
  vst3_lane_u8(s - 3, o0, 6);
  vst3_lane_u8(s + 0, o1, 6);
  s += p;
  vst3_lane_u8(s - 3, o0, 7);
  vst3_lane_u8(s + 0, o1, 7);
}

#define FUN_STORE8(w, r)                                                       \
  static INLINE void store_##w##x8(                                            \
      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
      const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4,  \
      const uint8x##w##_t s5, const uint8x##w##_t s6,                          \
      const uint8x##w##_t s7) {                                                \
    vst1##r##u8(s, s0);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s1);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s2);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s3);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s4);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s5);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s6);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s7);                                                        \
  }

FUN_STORE8(8, _)    // store_8x8
FUN_STORE8(16, q_)  // store_16x8
#undef FUN_STORE8

// Stores only the rows the filters may have modified: p1..q1 always, p2 and
// q2 when flat is set, and p6..p3/q3..q6 when flat2 is also set.
#define FUN_STORE14(w, r)                                                      \
  static INLINE void store_##w##x14(                                           \
      uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
      const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
      const uint32_t flat_status, const uint32_t flat2_status) {               \
    if (flat_status) {                                                         \
      if (flat2_status) {                                                      \
        vst1##r##u8(s - 7 * p, p6);                                            \
        vst1##r##u8(s - 6 * p, p5);                                            \
        vst1##r##u8(s - 5 * p, p4);                                            \
        vst1##r##u8(s - 4 * p, p3);                                            \
        vst1##r##u8(s + 3 * p, q3);                                            \
        vst1##r##u8(s + 4 * p, q4);                                            \
        vst1##r##u8(s + 5 * p, q5);                                            \
        vst1##r##u8(s + 6 * p, q6);                                            \
      }                                                                        \
      vst1##r##u8(s - 3 * p, p2);                                              \
      vst1##r##u8(s + 2 * p, q2);                                              \
    }                                                                          \
    vst1##r##u8(s - 2 * p, p1);                                                \
    vst1##r##u8(s - 1 * p, p0);                                                \
    vst1##r##u8(s + 0 * p, q0);                                                \
    vst1##r##u8(s + 1 * p, q1);                                                \
  }

FUN_STORE14(8, _)    // store_8x14
FUN_STORE14(16, q_)  // store_16x14
#undef FUN_STORE14

static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
                               const uint8x16_t s1, const uint8x16_t s2,
                               const uint8x16_t s3, const uint8x16_t s4,
                               const uint8x16_t s5, const uint8x16_t s6,
                               const uint8x16_t s7, const uint8x16_t s8,
                               const uint8x16_t s9, const uint8x16_t s10,
                               const uint8x16_t s11, const uint8x16_t s12,
                               const uint8x16_t s13, const uint8x16_t s14,
                               const uint8x16_t s15) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
  s += p;
  vst1q_u8(s, s4);
  s += p;
  vst1q_u8(s, s5);
  s += p;
  vst1q_u8(s, s6);
  s += p;
  vst1q_u8(s, s7);
  s += p;
  vst1q_u8(s, s8);
  s += p;
  vst1q_u8(s, s9);
  s += p;
  vst1q_u8(s, s10);
  s += p;
  vst1q_u8(s, s11);
  s += p;
  vst1q_u8(s, s12);
  s += p;
  vst1q_u8(s, s13);
  s += p;
  vst1q_u8(s, s14);
  s += p;
  vst1q_u8(s, s15);
}

#define FUN_HOR_4_KERNEL(name, w)                                           \
  static INLINE void lpf_horizontal_4##name##kernel(                        \
      uint8_t *s, const int p, const uint8x##w##_t blimit,                  \
      const uint8x##w##_t limit, const uint8x##w##_t thresh) {              \
    uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev;                \
                                                                            \
    load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);     \
    filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \
                         q3, &hev, &mask);                                  \
    filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);             \
    store_##w##x4(s - 2 * p, p, p1, p0, q0, q1);                            \
  }

FUN_HOR_4_KERNEL(_, 8)        // lpf_horizontal_4_kernel
FUN_HOR_4_KERNEL(_dual_, 16)  // lpf_horizontal_4_dual_kernel
#undef FUN_HOR_4_KERNEL

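// The public entry points below follow the vpx_dsp loop-filter convention:
// s points at the first pixel of the q0 row (horizontal filters) or the q0
// column (vertical filters), p is the stride in bytes, and blimit/limit/
// thresh each point to a single threshold byte. A hypothetical call that
// filters the horizontal edge above row r of an 8-bit frame buffer buf:
//   vpx_lpf_horizontal_4_neon(buf + r * stride, stride, blimit, limit,
//                             thresh);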
void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t blimit_vec, limit_vec, thresh_vec;
  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
  lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
}

void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
  uint8x16_t blimit_vec, limit_vec, thresh_vec;
  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &blimit_vec, &limit_vec, &thresh_vec);
  lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
}

void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      mask, hev;
  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                     q2, q3, &hev, &mask);
  filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
  store_4x8(s - 2, p, p1, p0, q0, q1);
}

void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      mask, hev;
  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
      s15;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &blimit_vec, &limit_vec, &thresh_vec);
  load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
            &s11, &s12, &s13, &s14, &s15);
  transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                    s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                      q2, q3, &hev, &mask);
  filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
  s -= 2;
  store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0),
            vget_low_u8(q1));
  store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0),
            vget_high_u8(q1));
}

    864 void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit,
    865                                const uint8_t *limit, const uint8_t *thresh) {
    866   uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
    867       op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
    868   uint32_t flat_status;
    869 
    870   load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
    871   load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    872   mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
    873                                 p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
    874   filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
    875             &op1, &op0, &oq0, &oq1, &oq2);
    876   store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
    877 }
    878 
void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint32_t flat_status;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &blimit_vec, &limit_vec, &thresh_vec);
  load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
                                 p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
  filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
             &op1, &op0, &oq0, &oq1, &oq2);
  store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
}

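// Filters a vertical edge across 8 rows with the 8-tap (flat) filter. The 8x8
// block is transposed so that columns become vectors, filtered, transposed
// back, and stored.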
void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint32_t flat_status;

  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
                                p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
  filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
            &op1, &op0, &oq0, &oq1, &oq2);
  // Note: transpose + store_8x8() is faster than store_6x8().
  transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
  store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
}

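// Filters a vertical edge crossing 16 rows with the 8-tap (flat) filter, with
// independent thresholds for the top and bottom 8 rows. The 6 modified
// columns are written back with store_6x8(), once per 8-row half.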
void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
      s15;
  uint32_t flat_status;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &blimit_vec, &limit_vec, &thresh_vec);
  load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
            &s11, &s12, &s13, &s14, &s15);
  transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                    s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
                                 p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
  filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
             &op1, &op0, &oq0, &oq1, &oq2);
  // Note: store_6x8() twice is faster than transpose + store_8x16().
  store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
            vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
  store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
            vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
            vget_high_u8(oq2));
}

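// Generates the shared kernel of the 16-wide (p7..q7) filter for the
// 8-sample and 16-sample variants: load the thresholds, build the
// mask/flat/hev masks from p3..q3, build the wider flat2 mask from the outer
// samples p7..p4 and q4..q7, then run filter16 to produce up to 14 output
// vectors. The flat and flat2 statuses are also returned so the callers can
// choose the cheapest store path.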
#define FUN_LPF_16_KERNEL(name, w)                                             \
  static INLINE void lpf_16##name##kernel(                                     \
      const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh,      \
      const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5,  \
      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
      const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
      const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5,          \
      uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2,              \
      uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,              \
      uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3,              \
      uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6,              \
      uint32_t *flat_status, uint32_t *flat2_status) {                         \
    uint8x##w##_t blimit_vec, limit_vec, thresh_vec, mask, flat, flat2, hev;   \
                                                                               \
    load_thresh_##w(blimit, limit, thresh, &blimit_vec, &limit_vec,            \
                    &thresh_vec);                                              \
    mask = filter_flat_hev_mask_##w(limit_vec, blimit_vec, thresh_vec, p3, p2, \
                                    p1, p0, q0, q1, q2, q3, &flat,             \
                                    flat_status, &hev);                        \
    flat2 = flat_mask5_##w(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,       \
                           flat2_status);                                      \
    filter16_##w(mask, flat, *flat_status, flat2, *flat2_status, hev, p7, p6,  \
                 p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,  \
                 op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5,   \
                 oq6);                                                         \
  }

FUN_LPF_16_KERNEL(_, 8)        // lpf_16_kernel
FUN_LPF_16_KERNEL(_dual_, 16)  // lpf_16_dual_kernel
#undef FUN_LPF_16_KERNEL

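// Filters a horizontal edge across 8 pixels with the full 16-wide filter.
// All 16 rows p7..q7 are loaded; the flat statuses are passed on to
// store_8x14() so it can skip rows the filter cannot have changed.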
void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
      op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
  uint32_t flat_status, flat2_status;

  load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2,
            &q3, &q4, &q5, &q6, &q7);
  lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1,
                q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1,
                &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status,
                &flat2_status);
  store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
             oq5, oq6, flat_status, flat2_status);
}

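// Same as vpx_lpf_horizontal_16_neon(), but 16 pixels wide. p3..q3 come from
// load_16x8(); the outer rows p7..p4 and q4..q7 are each a single 16-byte row
// and are loaded directly with vld1q_u8().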
void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh) {
  uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
      op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
  uint32_t flat_status, flat2_status;

  load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  p7 = vld1q_u8(s - 8 * p);
  p6 = vld1q_u8(s - 7 * p);
  p5 = vld1q_u8(s - 6 * p);
  p4 = vld1q_u8(s - 5 * p);
  q4 = vld1q_u8(s + 4 * p);
  q5 = vld1q_u8(s + 5 * p);
  q6 = vld1q_u8(s + 6 * p);
  q7 = vld1q_u8(s + 7 * p);
  lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0,
                     q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2,
                     &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
                     &flat_status, &flat2_status);
  store_16x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
              oq5, oq6, flat_status, flat2_status);
}

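// Filters a vertical edge across 8 rows with the full 16-wide filter. The
// 16x8 block is transposed so that the 16 columns p7..q7 become vectors. The
// store path is picked from the masks: all 16 columns when flat2 is set
// somewhere, 8 columns when only flat is, and the 4 columns p1..q1 otherwise.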
void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
      op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
  uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
  uint32_t flat_status, flat2_status;

  s -= 8;
  load_16x8(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
  transpose_u8_16x8(s0, s1, s2, s3, s4, s5, s6, s7, &p7, &p6, &p5, &p4, &p3,
                    &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
  lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1,
                q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1,
                &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status,
                &flat2_status);
  if (flat_status) {
    if (flat2_status) {
      transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
                        oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5,
                        &s6, &s7);
      store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
    } else {
      // Note: transpose + store_8x8() is faster than store_6x8().
      transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
      store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
    }
  } else {
    store_4x8(s + 6, p, op1, op0, oq0, oq1);
  }
}

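// Same as vpx_lpf_vertical_16_neon(), but the edge crosses 16 rows, so a full
// 16x16 transpose is needed and each narrow store path is applied to the top
// and bottom 8-row halves separately.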
void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh) {
  uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
      op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
  uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
      s15;
  uint32_t flat_status, flat2_status;

  s -= 8;
  load_16x16(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, &s11,
             &s12, &s13, &s14, &s15);
  transpose_u8_16x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                     s14, s15, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1,
                     &q2, &q3, &q4, &q5, &q6, &q7);
  lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0,
                     q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2,
                     &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
                     &flat_status, &flat2_status);
  if (flat_status) {
    if (flat2_status) {
      transpose_u8_16x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
                         oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5,
                         &s6, &s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14,
                         &s15);
      store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                  s13, s14, s15);
    } else {
      // Note: store_6x8() twice is faster than transpose + store_8x16().
      s += 8;
      store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
                vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
      store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
                vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
                vget_high_u8(oq2));
    }
  } else {
    s += 6;
    store_4x8(s, p, vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0),
              vget_low_u8(oq1));
    store_4x8(s + 8 * p, p, vget_high_u8(op1), vget_high_u8(op0),
              vget_high_u8(oq0), vget_high_u8(oq1));
  }
}