/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/transpose_neon.h"

static INLINE void load_thresh(const uint8_t *blimit, const uint8_t *limit,
                               const uint8_t *thresh, uint16x8_t *blimit_vec,
                               uint16x8_t *limit_vec, uint16x8_t *thresh_vec,
                               const int bd) {
  const int16x8_t shift = vdupq_n_s16(bd - 8);
  *blimit_vec = vmovl_u8(vld1_dup_u8(blimit));
  *limit_vec = vmovl_u8(vld1_dup_u8(limit));
  *thresh_vec = vmovl_u8(vld1_dup_u8(thresh));
  *blimit_vec = vshlq_u16(*blimit_vec, shift);
  *limit_vec = vshlq_u16(*limit_vec, shift);
  *thresh_vec = vshlq_u16(*thresh_vec, shift);
}
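
/*
 * Illustrative sketch (not compiled into the library): load_thresh() widens
 * each 8-bit threshold to 16 bits and scales it into the operating bit depth.
 * For a hypothetical blimit byte of 60 and bd == 10, the shift is
 * bd - 8 == 2, giving a 16-bit threshold of 60 << 2 == 240.
 */
#if 0
static uint16_t scale_thresh_scalar(uint8_t t, int bd) {
  return (uint16_t)(t << (bd - 8)); /* scalar form of vshlq_u16(vmovl_u8()) */
}
#endif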

// Here flat is a 128-bit vector, with each 16-bit chunk holding the mask of
// one pixel. When used to control filter branches, we only detect whether it
// is all 0s or all 1s. We sum flat down to a 32-bit number flat_status.
// flat equals 0 if and only if flat_status equals 0.
// flat equals -1 (all 1s) if and only if flat_status equals -4. (This is true
// because each mask occupies more than 1 bit.)
static INLINE uint32_t calc_flat_status(const uint16x8_t flat) {
  const uint32x2_t t0 = vadd_u32(vreinterpret_u32_u16(vget_low_u16(flat)),
                                 vreinterpret_u32_u16(vget_high_u16(flat)));
  const uint64x1_t t1 = vpaddl_u32(t0);
  return vget_lane_u32(vreinterpret_u32_u64(t1), 0);
}
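
/*
 * Worked example of the sentinel values (illustrative only): when all 8
 * lanes of flat are 0xFFFF, each 64-bit half viewed as two uint32 lanes is
 * { 0xFFFFFFFF, 0xFFFFFFFF }. The lane-wise add gives { 0xFFFFFFFE,
 * 0xFFFFFFFE }, and the widening pairwise add yields 0x1FFFFFFFC, whose low
 * 32 bits are 0xFFFFFFFC == (uint32_t)-4. A scalar model for checking:
 */
#if 0
static uint32_t calc_flat_status_scalar(const uint16_t flat[8]) {
  const uint32_t lo0 = (uint32_t)flat[0] | ((uint32_t)flat[1] << 16);
  const uint32_t lo1 = (uint32_t)flat[2] | ((uint32_t)flat[3] << 16);
  const uint32_t hi0 = (uint32_t)flat[4] | ((uint32_t)flat[5] << 16);
  const uint32_t hi1 = (uint32_t)flat[6] | ((uint32_t)flat[7] << 16);
  /* vadd_u32 (per-lane, modular), then vpaddl_u32 (widening pairwise add) */
  return (uint32_t)((uint64_t)(lo0 + hi0) + (uint64_t)(lo1 + hi1));
}
#endif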

static INLINE uint16x8_t
filter_hev_mask4(const uint16x8_t limit, const uint16x8_t blimit,
                 const uint16x8_t thresh, const uint16x8_t p3,
                 const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0,
                 const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2,
                 const uint16x8_t q3, uint16x8_t *hev, uint16x8_t *mask) {
  uint16x8_t max, t0, t1;

  max = vabdq_u16(p1, p0);
  max = vmaxq_u16(max, vabdq_u16(q1, q0));
  *hev = vcgtq_u16(max, thresh);
  *mask = vmaxq_u16(max, vabdq_u16(p3, p2));
  *mask = vmaxq_u16(*mask, vabdq_u16(p2, p1));
  *mask = vmaxq_u16(*mask, vabdq_u16(q2, q1));
  *mask = vmaxq_u16(*mask, vabdq_u16(q3, q2));
  t0 = vabdq_u16(p0, q0);
  t1 = vabdq_u16(p1, q1);
  t0 = vaddq_u16(t0, t0);
  t1 = vshrq_n_u16(t1, 1);
  t0 = vaddq_u16(t0, t1);
  *mask = vcleq_u16(*mask, limit);
  t0 = vcleq_u16(t0, blimit);
  *mask = vandq_u16(*mask, t0);

  return max;
}
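
/*
 * Scalar restatement of the conditions computed above (reference only):
 *   hev  = max(|p1-p0|, |q1-q0|) > thresh
 *   mask = max(|p1-p0|, |q1-q0|, |p3-p2|, |p2-p1|, |q2-q1|, |q3-q2|) <= limit
 *          && |p0-q0| * 2 + |p1-q1| / 2 <= blimit
 * A hypothetical per-pixel helper:
 */
#if 0
#include <stdlib.h>
static int filter_mask_scalar(int limit, int blimit, int p3, int p2, int p1,
                              int p0, int q0, int q1, int q2, int q3) {
  int m = abs(p1 - p0);
  if (abs(q1 - q0) > m) m = abs(q1 - q0);
  if (abs(p3 - p2) > m) m = abs(p3 - p2);
  if (abs(p2 - p1) > m) m = abs(p2 - p1);
  if (abs(q2 - q1) > m) m = abs(q2 - q1);
  if (abs(q3 - q2) > m) m = abs(q3 - q2);
  return m <= limit && abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
}
#endif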

static INLINE uint16x8_t filter_flat_hev_mask(
    const uint16x8_t limit, const uint16x8_t blimit, const uint16x8_t thresh,
    const uint16x8_t p3, const uint16x8_t p2, const uint16x8_t p1,
    const uint16x8_t p0, const uint16x8_t q0, const uint16x8_t q1,
    const uint16x8_t q2, const uint16x8_t q3, uint16x8_t *flat,
    uint32_t *flat_status, uint16x8_t *hev, const int bd) {
  uint16x8_t mask;
  const uint16x8_t max = filter_hev_mask4(limit, blimit, thresh, p3, p2, p1, p0,
                                          q0, q1, q2, q3, hev, &mask);
  *flat = vmaxq_u16(max, vabdq_u16(p2, p0));
  *flat = vmaxq_u16(*flat, vabdq_u16(q2, q0));
  *flat = vmaxq_u16(*flat, vabdq_u16(p3, p0));
  *flat = vmaxq_u16(*flat, vabdq_u16(q3, q0));
  *flat = vcleq_u16(*flat, vdupq_n_u16(1 << (bd - 8))); /* flat_mask4() */
  *flat = vandq_u16(*flat, mask);
  *flat_status = calc_flat_status(*flat);

  return mask;
}
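
/*
 * The flat condition above is flat_mask4() in scalar form: the six absolute
 * differences against p0/q0 must all be within 1 << (bd - 8), i.e. 1 for
 * 8-bit, 4 for 10-bit, and 16 for 12-bit. Hypothetical reference helper:
 */
#if 0
static int flat_mask4_scalar(int bd, int p3, int p2, int p1, int p0, int q0,
                             int q1, int q2, int q3) {
  const int t = 1 << (bd - 8);
  return abs(p1 - p0) <= t && abs(q1 - q0) <= t && abs(p2 - p0) <= t &&
         abs(q2 - q0) <= t && abs(p3 - p0) <= t && abs(q3 - q0) <= t;
}
#endif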

static INLINE uint16x8_t flat_mask5(const uint16x8_t p4, const uint16x8_t p3,
                                    const uint16x8_t p2, const uint16x8_t p1,
                                    const uint16x8_t p0, const uint16x8_t q0,
                                    const uint16x8_t q1, const uint16x8_t q2,
                                    const uint16x8_t q3, const uint16x8_t q4,
                                    const uint16x8_t flat,
                                    uint32_t *flat2_status, const int bd) {
  uint16x8_t flat2 = vabdq_u16(p4, p0);
  flat2 = vmaxq_u16(flat2, vabdq_u16(p3, p0));
  flat2 = vmaxq_u16(flat2, vabdq_u16(p2, p0));
  flat2 = vmaxq_u16(flat2, vabdq_u16(p1, p0));
  flat2 = vmaxq_u16(flat2, vabdq_u16(q1, q0));
  flat2 = vmaxq_u16(flat2, vabdq_u16(q2, q0));
  flat2 = vmaxq_u16(flat2, vabdq_u16(q3, q0));
  flat2 = vmaxq_u16(flat2, vabdq_u16(q4, q0));
  flat2 = vcleq_u16(flat2, vdupq_n_u16(1 << (bd - 8)));
  flat2 = vandq_u16(flat2, flat);
  *flat2_status = calc_flat_status(flat2);

  return flat2;
}

static INLINE int16x8_t flip_sign(const uint16x8_t v, const int bd) {
  const uint16x8_t offset = vdupq_n_u16(0x80 << (bd - 8));
  return vreinterpretq_s16_u16(vsubq_u16(v, offset));
}

static INLINE uint16x8_t flip_sign_back(const int16x8_t v, const int bd) {
  const int16x8_t offset = vdupq_n_s16(0x80 << (bd - 8));
  return vreinterpretq_u16_s16(vaddq_s16(v, offset));
}
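
/*
 * Example of the signed-domain mapping (illustrative values): for bd == 10
 * the offset is 0x80 << 2 == 512, so the unsigned pixel range [0, 1023]
 * maps onto [-512, 511], mirroring the 8-bit filter's XOR with 0x80.
 * flip_sign_back() undoes the shift after filtering.
 */
#if 0
static int16_t flip_sign_scalar(uint16_t v, int bd) {
  return (int16_t)(v - (0x80 << (bd - 8)));
}
#endif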

static INLINE void filter_update(const uint16x8_t sub0, const uint16x8_t sub1,
                                 const uint16x8_t add0, const uint16x8_t add1,
                                 uint16x8_t *sum) {
  *sum = vsubq_u16(*sum, sub0);
  *sum = vsubq_u16(*sum, sub1);
  *sum = vaddq_u16(*sum, add0);
  *sum = vaddq_u16(*sum, add1);
}

static INLINE uint16x8_t calc_7_tap_filter_kernel(const uint16x8_t sub0,
                                                  const uint16x8_t sub1,
                                                  const uint16x8_t add0,
                                                  const uint16x8_t add1,
                                                  uint16x8_t *sum) {
  filter_update(sub0, sub1, add0, add1, sum);
  return vrshrq_n_u16(*sum, 3);
}

static INLINE uint16x8_t apply_15_tap_filter_kernel(
    const uint16x8_t flat, const uint16x8_t sub0, const uint16x8_t sub1,
    const uint16x8_t add0, const uint16x8_t add1, const uint16x8_t in,
    uint16x8_t *sum) {
  filter_update(sub0, sub1, add0, add1, sum);
  return vbslq_u16(flat, vrshrq_n_u16(*sum, 4), in);
}
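
/*
 * Both kernels above keep a running window sum and slide it by one pixel
 * per call: subtract the two samples leaving the window, add the two
 * entering it, then round. Scalar model (names hypothetical):
 *   sum' = sum - sub0 - sub1 + add0 + add1
 *   out  = (sum' + 4) >> 3   for the 7-tap kernel (vrshrq_n_u16(sum, 3))
 *   out  = (sum' + 8) >> 4   for the 15-tap kernel (vrshrq_n_u16(sum, 4))
 */
#if 0
static uint16_t calc_7_tap_kernel_scalar(uint16_t sub0, uint16_t sub1,
                                         uint16_t add0, uint16_t add1,
                                         uint16_t *sum) {
  *sum = (uint16_t)(*sum - sub0 - sub1 + add0 + add1);
  return (uint16_t)((*sum + 4) >> 3);
}
#endif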

// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
static INLINE void calc_7_tap_filter(const uint16x8_t p3, const uint16x8_t p2,
                                     const uint16x8_t p1, const uint16x8_t p0,
                                     const uint16x8_t q0, const uint16x8_t q1,
                                     const uint16x8_t q2, const uint16x8_t q3,
                                     uint16x8_t *op2, uint16x8_t *op1,
                                     uint16x8_t *op0, uint16x8_t *oq0,
                                     uint16x8_t *oq1, uint16x8_t *oq2) {
  uint16x8_t sum;
  sum = vaddq_u16(p3, p3);   // 2*p3
  sum = vaddq_u16(sum, p3);  // 3*p3
  sum = vaddq_u16(sum, p2);  // 3*p3+p2
  sum = vaddq_u16(sum, p2);  // 3*p3+2*p2
  sum = vaddq_u16(sum, p1);  // 3*p3+2*p2+p1
  sum = vaddq_u16(sum, p0);  // 3*p3+2*p2+p1+p0
  sum = vaddq_u16(sum, q0);  // 3*p3+2*p2+p1+p0+q0
  *op2 = vrshrq_n_u16(sum, 3);
  *op1 = calc_7_tap_filter_kernel(p3, p2, p1, q1, &sum);
  *op0 = calc_7_tap_filter_kernel(p3, p1, p0, q2, &sum);
  *oq0 = calc_7_tap_filter_kernel(p3, p0, q0, q3, &sum);
  *oq1 = calc_7_tap_filter_kernel(p2, q0, q1, q3, &sum);
  *oq2 = calc_7_tap_filter_kernel(p1, q1, q2, q3, &sum);
}
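
/*
 * Expanding the running sum gives the familiar scalar form of the six
 * outputs, with ROUND3(x) = (x + 4) >> 3:
 *   op2 = ROUND3(3*p3 + 2*p2 + p1 + p0 + q0)
 *   op1 = ROUND3(2*p3 + p2 + 2*p1 + p0 + q0 + q1)
 *   op0 = ROUND3(p3 + p2 + p1 + 2*p0 + q0 + q1 + q2)
 *   oq0 = ROUND3(p2 + p1 + p0 + 2*q0 + q1 + q2 + q3)
 *   oq1 = ROUND3(p1 + p0 + q0 + 2*q1 + q2 + 2*q3)
 *   oq2 = ROUND3(p0 + q0 + q1 + 2*q2 + 3*q3)
 */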

static INLINE void apply_7_tap_filter(const uint16x8_t flat,
                                      const uint16x8_t p3, const uint16x8_t p2,
                                      const uint16x8_t p1, const uint16x8_t p0,
                                      const uint16x8_t q0, const uint16x8_t q1,
                                      const uint16x8_t q2, const uint16x8_t q3,
                                      uint16x8_t *op2, uint16x8_t *op1,
                                      uint16x8_t *op0, uint16x8_t *oq0,
                                      uint16x8_t *oq1, uint16x8_t *oq2) {
  uint16x8_t tp1, tp0, tq0, tq1;
  calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, &tq0, &tq1,
                    oq2);
  *op2 = vbslq_u16(flat, *op2, p2);
  *op1 = vbslq_u16(flat, tp1, *op1);
  *op0 = vbslq_u16(flat, tp0, *op0);
  *oq0 = vbslq_u16(flat, tq0, *oq0);
  *oq1 = vbslq_u16(flat, tq1, *oq1);
  *oq2 = vbslq_u16(flat, *oq2, q2);
}

// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
static INLINE void apply_15_tap_filter(
    const uint16x8_t flat2, const uint16x8_t p7, const uint16x8_t p6,
    const uint16x8_t p5, const uint16x8_t p4, const uint16x8_t p3,
    const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0,
    const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2,
    const uint16x8_t q3, const uint16x8_t q4, const uint16x8_t q5,
    const uint16x8_t q6, const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5,
    uint16x8_t *op4, uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1,
    uint16x8_t *op0, uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2,
    uint16x8_t *oq3, uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6) {
  uint16x8_t sum;
  sum = vshlq_n_u16(p7, 3);  // 8*p7
  sum = vsubq_u16(sum, p7);  // 7*p7
  sum = vaddq_u16(sum, p6);  // 7*p7+p6
  sum = vaddq_u16(sum, p6);  // 7*p7+2*p6
  sum = vaddq_u16(sum, p5);  // 7*p7+2*p6+p5
  sum = vaddq_u16(sum, p4);  // 7*p7+2*p6+p5+p4
  sum = vaddq_u16(sum, p3);  // 7*p7+2*p6+p5+p4+p3
  sum = vaddq_u16(sum, p2);  // 7*p7+2*p6+p5+p4+p3+p2
  sum = vaddq_u16(sum, p1);  // 7*p7+2*p6+p5+p4+p3+p2+p1
  sum = vaddq_u16(sum, p0);  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
  sum = vaddq_u16(sum, q0);  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
  *op6 = vbslq_u16(flat2, vrshrq_n_u16(sum, 4), p6);
  *op5 = apply_15_tap_filter_kernel(flat2, p7, p6, p5, q1, p5, &sum);
  *op4 = apply_15_tap_filter_kernel(flat2, p7, p5, p4, q2, p4, &sum);
  *op3 = apply_15_tap_filter_kernel(flat2, p7, p4, p3, q3, p3, &sum);
  *op2 = apply_15_tap_filter_kernel(flat2, p7, p3, p2, q4, *op2, &sum);
  *op1 = apply_15_tap_filter_kernel(flat2, p7, p2, p1, q5, *op1, &sum);
  *op0 = apply_15_tap_filter_kernel(flat2, p7, p1, p0, q6, *op0, &sum);
  *oq0 = apply_15_tap_filter_kernel(flat2, p7, p0, q0, q7, *oq0, &sum);
  *oq1 = apply_15_tap_filter_kernel(flat2, p6, q0, q1, q7, *oq1, &sum);
  *oq2 = apply_15_tap_filter_kernel(flat2, p5, q1, q2, q7, *oq2, &sum);
  *oq3 = apply_15_tap_filter_kernel(flat2, p4, q2, q3, q7, q3, &sum);
  *oq4 = apply_15_tap_filter_kernel(flat2, p3, q3, q4, q7, q4, &sum);
  *oq5 = apply_15_tap_filter_kernel(flat2, p2, q4, q5, q7, q5, &sum);
  *oq6 = apply_15_tap_filter_kernel(flat2, p1, q5, q6, q7, q6, &sum);
}

static INLINE void filter4(const uint16x8_t mask, const uint16x8_t hev,
                           const uint16x8_t p1, const uint16x8_t p0,
                           const uint16x8_t q0, const uint16x8_t q1,
                           uint16x8_t *op1, uint16x8_t *op0, uint16x8_t *oq0,
                           uint16x8_t *oq1, const int bd) {
  const int16x8_t max = vdupq_n_s16((1 << (bd - 1)) - 1);
  const int16x8_t min = vdupq_n_s16((int16_t)(((uint32_t)-1) << (bd - 1)));
  int16x8_t filter, filter1, filter2, t;
  int16x8_t ps1 = flip_sign(p1, bd);
  int16x8_t ps0 = flip_sign(p0, bd);
  int16x8_t qs0 = flip_sign(q0, bd);
  int16x8_t qs1 = flip_sign(q1, bd);

  /* add outer taps if we have high edge variance */
  filter = vsubq_s16(ps1, qs1);
  filter = vmaxq_s16(filter, min);
  filter = vminq_s16(filter, max);
  filter = vandq_s16(filter, vreinterpretq_s16_u16(hev));
  t = vsubq_s16(qs0, ps0);

  /* inner taps */
  filter = vaddq_s16(filter, t);
  filter = vaddq_s16(filter, t);
  filter = vaddq_s16(filter, t);
  filter = vmaxq_s16(filter, min);
  filter = vminq_s16(filter, max);
  filter = vandq_s16(filter, vreinterpretq_s16_u16(mask));

  /* Round the correction in opposite directions on the two sides: q0 moves
   * by (filter + 4) >> 3 and p0 by (filter + 3) >> 3, so the rounding bias
   * is split between them. */
  t = vaddq_s16(filter, vdupq_n_s16(4));
  t = vminq_s16(t, max);
  filter1 = vshrq_n_s16(t, 3);
  t = vaddq_s16(filter, vdupq_n_s16(3));
  t = vminq_s16(t, max);
  filter2 = vshrq_n_s16(t, 3);

  qs0 = vsubq_s16(qs0, filter1);
  qs0 = vmaxq_s16(qs0, min);
  qs0 = vminq_s16(qs0, max);
  ps0 = vaddq_s16(ps0, filter2);
  ps0 = vmaxq_s16(ps0, min);
  ps0 = vminq_s16(ps0, max);
  *oq0 = flip_sign_back(qs0, bd);
  *op0 = flip_sign_back(ps0, bd);

  /* outer tap adjustments */
  filter = vrshrq_n_s16(filter1, 1);
  filter = vbicq_s16(filter, vreinterpretq_s16_u16(hev));

  qs1 = vsubq_s16(qs1, filter);
  qs1 = vmaxq_s16(qs1, min);
  qs1 = vminq_s16(qs1, max);
  ps1 = vaddq_s16(ps1, filter);
  ps1 = vmaxq_s16(ps1, min);
  ps1 = vminq_s16(ps1, max);
  *oq1 = flip_sign_back(qs1, bd);
  *op1 = flip_sign_back(ps1, bd);
}
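
/*
 * Scalar outline of filter4() (hypothetical helper; the NEON code clamps
 * intermediate sums to the signed bd-bit range, abbreviated here as
 * clamp_bd()). The +4/+3 pair makes the two sides round in opposite
 * directions, and the outer taps are only adjusted where hev is off:
 */
#if 0
static int clamp_bd(int v, int bd) {
  const int mx = (1 << (bd - 1)) - 1;
  return v < -mx - 1 ? -mx - 1 : (v > mx ? mx : v);
}
static void filter4_scalar(int mask, int hev, int bd, int *ps1, int *ps0,
                           int *qs0, int *qs1) {
  int f = hev ? clamp_bd(*ps1 - *qs1, bd) : 0;     /* outer taps if hev */
  int f1, f2;
  f = clamp_bd(f + 3 * (*qs0 - *ps0), bd);         /* inner taps */
  f = mask ? f : 0;
  f1 = clamp_bd(f + 4, bd) >> 3;
  f2 = clamp_bd(f + 3, bd) >> 3;
  *qs0 = clamp_bd(*qs0 - f1, bd);
  *ps0 = clamp_bd(*ps0 + f2, bd);
  f = (f1 + 1) >> 1;                               /* outer adjustment */
  if (!hev) {
    *qs1 = clamp_bd(*qs1 - f, bd);
    *ps1 = clamp_bd(*ps1 + f, bd);
  }
}
#endif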

static INLINE void filter8(const uint16x8_t mask, const uint16x8_t flat,
                           const uint32_t flat_status, const uint16x8_t hev,
                           const uint16x8_t p3, const uint16x8_t p2,
                           const uint16x8_t p1, const uint16x8_t p0,
                           const uint16x8_t q0, const uint16x8_t q1,
                           const uint16x8_t q2, const uint16x8_t q3,
                           uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0,
                           uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2,
                           const int bd) {
  if (flat_status != (uint32_t)-4) {
    filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd);
    *op2 = p2;
    *oq2 = q2;
    if (flat_status) {
      apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,
                         oq0, oq1, oq2);
    }
  } else {
    calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, oq0, oq1,
                      oq2);
  }
}
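
/*
 * Dispatch summary for filter8() (filter16() follows the same pattern):
 *   flat_status == 0            -> filter4() only; p2/q2 pass through.
 *   partially flat              -> filter4(), then apply_7_tap_filter()
 *                                  blends the 7-tap output per pixel.
 *   flat_status == (uint32_t)-4 -> every pixel is flat, so filter4() and
 *                                  the blend are skipped entirely.
 */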

static INLINE void filter16(
    const uint16x8_t mask, const uint16x8_t flat, const uint32_t flat_status,
    const uint16x8_t flat2, const uint32_t flat2_status, const uint16x8_t hev,
    const uint16x8_t p7, const uint16x8_t p6, const uint16x8_t p5,
    const uint16x8_t p4, const uint16x8_t p3, const uint16x8_t p2,
    const uint16x8_t p1, const uint16x8_t p0, const uint16x8_t q0,
    const uint16x8_t q1, const uint16x8_t q2, const uint16x8_t q3,
    const uint16x8_t q4, const uint16x8_t q5, const uint16x8_t q6,
    const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, uint16x8_t *op4,
    uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0,
    uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, uint16x8_t *oq3,
    uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6, const int bd) {
  if (flat_status != (uint32_t)-4) {
    filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd);
  }

  if (flat_status) {
    *op2 = p2;
    *oq2 = q2;
    if (flat2_status != (uint32_t)-4) {
      apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,
                         oq0, oq1, oq2);
    }
    if (flat2_status) {
      apply_15_tap_filter(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3,
                          q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0,
                          oq0, oq1, oq2, oq3, oq4, oq5, oq6);
    }
  }
}

static INLINE void load_8x8(const uint16_t *s, const int p, uint16x8_t *p3,
                            uint16x8_t *p2, uint16x8_t *p1, uint16x8_t *p0,
                            uint16x8_t *q0, uint16x8_t *q1, uint16x8_t *q2,
                            uint16x8_t *q3) {
  *p3 = vld1q_u16(s);
  s += p;
  *p2 = vld1q_u16(s);
  s += p;
  *p1 = vld1q_u16(s);
  s += p;
  *p0 = vld1q_u16(s);
  s += p;
  *q0 = vld1q_u16(s);
  s += p;
  *q1 = vld1q_u16(s);
  s += p;
  *q2 = vld1q_u16(s);
  s += p;
  *q3 = vld1q_u16(s);
}

static INLINE void load_8x16(const uint16_t *s, const int p, uint16x8_t *s0,
                             uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3,
                             uint16x8_t *s4, uint16x8_t *s5, uint16x8_t *s6,
                             uint16x8_t *s7, uint16x8_t *s8, uint16x8_t *s9,
                             uint16x8_t *s10, uint16x8_t *s11, uint16x8_t *s12,
                             uint16x8_t *s13, uint16x8_t *s14,
                             uint16x8_t *s15) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
  s += p;
  *s8 = vld1q_u16(s);
  s += p;
  *s9 = vld1q_u16(s);
  s += p;
  *s10 = vld1q_u16(s);
  s += p;
  *s11 = vld1q_u16(s);
  s += p;
  *s12 = vld1q_u16(s);
  s += p;
  *s13 = vld1q_u16(s);
  s += p;
  *s14 = vld1q_u16(s);
  s += p;
  *s15 = vld1q_u16(s);
}

static INLINE void store_8x4(uint16_t *s, const int p, const uint16x8_t s0,
                             const uint16x8_t s1, const uint16x8_t s2,
                             const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
  s += p;
  vst1q_u16(s, s3);
}

static INLINE void store_8x6(uint16_t *s, const int p, const uint16x8_t s0,
                             const uint16x8_t s1, const uint16x8_t s2,
                             const uint16x8_t s3, const uint16x8_t s4,
                             const uint16x8_t s5) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
  s += p;
  vst1q_u16(s, s3);
  s += p;
  vst1q_u16(s, s4);
  s += p;
  vst1q_u16(s, s5);
}

static INLINE void store_4x8(uint16_t *s, const int p, const uint16x8_t p1,
                             const uint16x8_t p0, const uint16x8_t q0,
                             const uint16x8_t q1) {
  uint16x8x4_t o;

  o.val[0] = p1;
  o.val[1] = p0;
  o.val[2] = q0;
  o.val[3] = q1;
  vst4q_lane_u16(s, o, 0);
  s += p;
  vst4q_lane_u16(s, o, 1);
  s += p;
  vst4q_lane_u16(s, o, 2);
  s += p;
  vst4q_lane_u16(s, o, 3);
  s += p;
  vst4q_lane_u16(s, o, 4);
  s += p;
  vst4q_lane_u16(s, o, 5);
  s += p;
  vst4q_lane_u16(s, o, 6);
  s += p;
  vst4q_lane_u16(s, o, 7);
}

static INLINE void store_6x8(uint16_t *s, const int p, const uint16x8_t s0,
                             const uint16x8_t s1, const uint16x8_t s2,
                             const uint16x8_t s3, const uint16x8_t s4,
                             const uint16x8_t s5) {
  uint16x8x3_t o0, o1;

  o0.val[0] = s0;
  o0.val[1] = s1;
  o0.val[2] = s2;
  o1.val[0] = s3;
  o1.val[1] = s4;
  o1.val[2] = s5;
  vst3q_lane_u16(s - 3, o0, 0);
  vst3q_lane_u16(s + 0, o1, 0);
  s += p;
  vst3q_lane_u16(s - 3, o0, 1);
  vst3q_lane_u16(s + 0, o1, 1);
  s += p;
  vst3q_lane_u16(s - 3, o0, 2);
  vst3q_lane_u16(s + 0, o1, 2);
  s += p;
  vst3q_lane_u16(s - 3, o0, 3);
  vst3q_lane_u16(s + 0, o1, 3);
  s += p;
  vst3q_lane_u16(s - 3, o0, 4);
  vst3q_lane_u16(s + 0, o1, 4);
  s += p;
  vst3q_lane_u16(s - 3, o0, 5);
  vst3q_lane_u16(s + 0, o1, 5);
  s += p;
  vst3q_lane_u16(s - 3, o0, 6);
  vst3q_lane_u16(s + 0, o1, 6);
  s += p;
  vst3q_lane_u16(s - 3, o0, 7);
  vst3q_lane_u16(s + 0, o1, 7);
}

static INLINE void store_7x8(uint16_t *s, const int p, const uint16x8_t s0,
                             const uint16x8_t s1, const uint16x8_t s2,
                             const uint16x8_t s3, const uint16x8_t s4,
                             const uint16x8_t s5, const uint16x8_t s6) {
  uint16x8x4_t o0;
  uint16x8x3_t o1;

  o0.val[0] = s0;
  o0.val[1] = s1;
  o0.val[2] = s2;
  o0.val[3] = s3;
  o1.val[0] = s4;
  o1.val[1] = s5;
  o1.val[2] = s6;
  vst4q_lane_u16(s - 4, o0, 0);
  vst3q_lane_u16(s + 0, o1, 0);
  s += p;
  vst4q_lane_u16(s - 4, o0, 1);
  vst3q_lane_u16(s + 0, o1, 1);
  s += p;
  vst4q_lane_u16(s - 4, o0, 2);
  vst3q_lane_u16(s + 0, o1, 2);
  s += p;
  vst4q_lane_u16(s - 4, o0, 3);
  vst3q_lane_u16(s + 0, o1, 3);
  s += p;
  vst4q_lane_u16(s - 4, o0, 4);
  vst3q_lane_u16(s + 0, o1, 4);
  s += p;
  vst4q_lane_u16(s - 4, o0, 5);
  vst3q_lane_u16(s + 0, o1, 5);
  s += p;
  vst4q_lane_u16(s - 4, o0, 6);
  vst3q_lane_u16(s + 0, o1, 6);
  s += p;
  vst4q_lane_u16(s - 4, o0, 7);
  vst3q_lane_u16(s + 0, o1, 7);
}

static INLINE void store_8x14(uint16_t *s, const int p, const uint16x8_t p6,
                              const uint16x8_t p5, const uint16x8_t p4,
                              const uint16x8_t p3, const uint16x8_t p2,
                              const uint16x8_t p1, const uint16x8_t p0,
                              const uint16x8_t q0, const uint16x8_t q1,
                              const uint16x8_t q2, const uint16x8_t q3,
                              const uint16x8_t q4, const uint16x8_t q5,
                              const uint16x8_t q6, const uint32_t flat_status,
                              const uint32_t flat2_status) {
  if (flat_status) {
    if (flat2_status) {
      vst1q_u16(s - 7 * p, p6);
      vst1q_u16(s - 6 * p, p5);
      vst1q_u16(s - 5 * p, p4);
      vst1q_u16(s - 4 * p, p3);
      vst1q_u16(s + 3 * p, q3);
      vst1q_u16(s + 4 * p, q4);
      vst1q_u16(s + 5 * p, q5);
      vst1q_u16(s + 6 * p, q6);
    }
    vst1q_u16(s - 3 * p, p2);
    vst1q_u16(s + 2 * p, q2);
  }
  vst1q_u16(s - 2 * p, p1);
  vst1q_u16(s - 1 * p, p0);
  vst1q_u16(s + 0 * p, q0);
  vst1q_u16(s + 1 * p, q1);
}

void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int p, const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      mask, hev;

  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                   q2, q3, &hev, &mask);
  filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd);
  store_8x4(s - 2 * p, p, p1, p0, q0, q1);
}
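
/*
 * Hypothetical usage sketch (values invented for illustration; real callers
 * take the thresholds from the loop-filter level tables): filter one
 * 8-pixel-wide horizontal edge of a 10-bit frame.
 */
#if 0
static void example_call(uint16_t *frame, int stride) {
  const uint8_t blimit = 60, limit = 10, thresh = 4;
  /* frame + 16 * stride is the first row below the edge being filtered. */
  vpx_highbd_lpf_horizontal_4_neon(frame + 16 * stride, stride, &blimit,
                                   &limit, &thresh, 10);
}
#endif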

void vpx_highbd_lpf_horizontal_4_dual_neon(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_horizontal_4_neon(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_4_neon(s + 8, p, blimit1, limit1, thresh1, bd);
}

void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      mask, hev;

  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1,
                    (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1,
                    (int16x8_t *)&q2, (int16x8_t *)&q3);
  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                   q2, q3, &hev, &mask);
  filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd);
  store_4x8(s - 2, p, p1, p0, q0, q1);
}

void vpx_highbd_lpf_vertical_4_dual_neon(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
}

void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int p, const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint32_t flat_status;

  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
                              q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
  filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
          &op1, &op0, &oq0, &oq1, &oq2, bd);
  store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
}

void vpx_highbd_lpf_horizontal_8_dual_neon(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, bd);
}

void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint32_t flat_status;

  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1,
                    (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1,
                    (int16x8_t *)&q2, (int16x8_t *)&q3);
  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
                              q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
  filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
          &op1, &op0, &oq0, &oq1, &oq2, bd);
  // Note: store_6x8() is faster than transpose + store_8x8().
  store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2);
}

void vpx_highbd_lpf_vertical_8_dual_neon(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
}

static void lpf_horizontal_16_kernel(uint16_t *s, int p,
                                     const uint16x8_t blimit_vec,
                                     const uint16x8_t limit_vec,
                                     const uint16x8_t thresh_vec,
                                     const int bd) {
  uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2,
      q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
      oq4, oq5, oq6;
  uint32_t flat_status, flat2_status;

  load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2,
            &q3, &q4, &q5, &q6, &q7);
  mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
                              q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
  flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,
                     &flat2_status, bd);
  filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4,
           p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4,
           &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
           bd);
  store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
             oq5, oq6, flat_status, flat2_status);
}

static void lpf_vertical_16_kernel(uint16_t *s, int p,
                                   const uint16x8_t blimit_vec,
                                   const uint16x8_t limit_vec,
                                   const uint16x8_t thresh_vec, const int bd) {
  uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2,
      q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
      oq4, oq5, oq6;
  uint32_t flat_status, flat2_status;

  load_8x8(s - 8, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0);
  transpose_s16_8x8((int16x8_t *)&p7, (int16x8_t *)&p6, (int16x8_t *)&p5,
                    (int16x8_t *)&p4, (int16x8_t *)&p3, (int16x8_t *)&p2,
                    (int16x8_t *)&p1, (int16x8_t *)&p0);
  load_8x8(s, p, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
  transpose_s16_8x8((int16x8_t *)&q0, (int16x8_t *)&q1, (int16x8_t *)&q2,
                    (int16x8_t *)&q3, (int16x8_t *)&q4, (int16x8_t *)&q5,
                    (int16x8_t *)&q6, (int16x8_t *)&q7);
  mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
                              q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
  flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,
                     &flat2_status, bd);
  filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4,
           p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4,
           &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
           bd);
  if (flat_status) {
    if (flat2_status) {
      store_7x8(s - 3, p, op6, op5, op4, op3, op2, op1, op0);
      store_7x8(s + 4, p, oq0, oq1, oq2, oq3, oq4, oq5, oq6);
    } else {
      // Note: store_6x8() is faster than transpose + store_8x8().
      store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2);
    }
  } else {
    store_4x8(s - 2, p, op1, op0, oq0, oq1);
  }
}

void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh, int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec;
  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
}

void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int p,
                                            const uint8_t *blimit,
                                            const uint8_t *limit,
                                            const uint8_t *thresh, int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec;
  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
  lpf_horizontal_16_kernel(s + 8, p, blimit_vec, limit_vec, thresh_vec, bd);
}

void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec;
  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
}

void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int p,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec;
  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
  lpf_vertical_16_kernel(s + 8 * p, p, blimit_vec, limit_vec, thresh_vec, bd);
}