/* Home | History | Annotate | Download | only in arm */
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 #include "./vpx_config.h"
     14 #include "./vpx_dsp_rtcd.h"
     15 
     16 #include "vpx/vpx_integer.h"
     17 #include "vpx_dsp/arm/mem_neon.h"
     18 #include "vpx_dsp/arm/sum_neon.h"
     19 
     20 uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
     21                          const uint8_t *ref_ptr, int ref_stride) {
     22   const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
     23   const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
     24   uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
     25   abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
     26   return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
     27 }
     28 
     29 uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
     30                              const uint8_t *ref_ptr, int ref_stride,
     31                              const uint8_t *second_pred) {
     32   const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
     33   const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
     34   const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
     35   const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
     36   uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg));
     37   abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
     38   return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
     39 }
     40 
     41 uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
     42                          const uint8_t *ref_ptr, int ref_stride) {
     43   int i;
     44   uint16x8_t abs = vdupq_n_u16(0);
     45   for (i = 0; i < 8; i += 4) {
     46     const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
     47     const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
     48     src_ptr += 4 * src_stride;
     49     ref_ptr += 4 * ref_stride;
     50     abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8));
     51     abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
     52   }
     53 
     54   return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
     55 }
     56 
     57 uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
     58                              const uint8_t *ref_ptr, int ref_stride,
     59                              const uint8_t *second_pred) {
     60   int i;
     61   uint16x8_t abs = vdupq_n_u16(0);
     62   for (i = 0; i < 8; i += 4) {
     63     const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
     64     const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
     65     const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
     66     const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
     67     src_ptr += 4 * src_stride;
     68     ref_ptr += 4 * ref_stride;
     69     second_pred += 16;
     70     abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg));
     71     abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
     72   }
     73 
     74   return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
     75 }
     76 
     77 static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
     78                                const uint8_t *ref_ptr, int ref_stride,
     79                                const int height) {
     80   int i;
     81   uint16x8_t abs = vdupq_n_u16(0);
     82 
     83   for (i = 0; i < height; ++i) {
     84     const uint8x8_t a_u8 = vld1_u8(src_ptr);
     85     const uint8x8_t b_u8 = vld1_u8(ref_ptr);
     86     src_ptr += src_stride;
     87     ref_ptr += ref_stride;
     88     abs = vabal_u8(abs, a_u8, b_u8);
     89   }
     90   return abs;
     91 }
     92 
     93 static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
     94                                    const uint8_t *ref_ptr, int ref_stride,
     95                                    const uint8_t *second_pred,
     96                                    const int height) {
     97   int i;
     98   uint16x8_t abs = vdupq_n_u16(0);
     99 
    100   for (i = 0; i < height; ++i) {
    101     const uint8x8_t a_u8 = vld1_u8(src_ptr);
    102     const uint8x8_t b_u8 = vld1_u8(ref_ptr);
    103     const uint8x8_t c_u8 = vld1_u8(second_pred);
    104     const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);
    105     src_ptr += src_stride;
    106     ref_ptr += ref_stride;
    107     second_pred += 8;
    108     abs = vabal_u8(abs, a_u8, avg);
    109   }
    110   return abs;
    111 }
    112 
/* Defines vpx_sad8x<n>_neon() and vpx_sad8x<n>_avg_neon(): each runs the
 * shared 8-wide row loop above for n rows, then folds the eight 16-bit
 * per-lane sums into the scalar SAD that is returned. */
#define sad8xN(n)                                                              \
  uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride,         \
                               const uint8_t *ref_ptr, int ref_stride) {       \
    const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \
  }                                                                            \
                                                                               \
  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,     \
                                   const uint8_t *ref_ptr, int ref_stride,     \
                                   const uint8_t *second_pred) {               \
    const uint16x8_t abs =                                                     \
        sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n);   \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \
  }

/* Instantiate the 8-wide SAD kernels for 8x4, 8x8 and 8x16 blocks. */
sad8xN(4);
sad8xN(8);
sad8xN(16);
    131 
    132 static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
    133                                 const uint8_t *ref_ptr, int ref_stride,
    134                                 const int height) {
    135   int i;
    136   uint16x8_t abs = vdupq_n_u16(0);
    137 
    138   for (i = 0; i < height; ++i) {
    139     const uint8x16_t a_u8 = vld1q_u8(src_ptr);
    140     const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
    141     src_ptr += src_stride;
    142     ref_ptr += ref_stride;
    143     abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8));
    144     abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8));
    145   }
    146   return abs;
    147 }
    148 
    149 static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
    150                                     const uint8_t *ref_ptr, int ref_stride,
    151                                     const uint8_t *second_pred,
    152                                     const int height) {
    153   int i;
    154   uint16x8_t abs = vdupq_n_u16(0);
    155 
    156   for (i = 0; i < height; ++i) {
    157     const uint8x16_t a_u8 = vld1q_u8(src_ptr);
    158     const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
    159     const uint8x16_t c_u8 = vld1q_u8(second_pred);
    160     const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8);
    161     src_ptr += src_stride;
    162     ref_ptr += ref_stride;
    163     second_pred += 16;
    164     abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg));
    165     abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg));
    166   }
    167   return abs;
    168 }
    169 
/* Defines vpx_sad16x<n>_neon() and vpx_sad16x<n>_avg_neon(): each runs the
 * shared 16-wide row loop above for n rows, then folds the eight 16-bit
 * per-lane sums into the scalar SAD that is returned. */
#define sad16xN(n)                                                            \
  uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
                                const uint8_t *ref_ptr, int ref_stride) {     \
    const uint16x8_t abs =                                                    \
        sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
  }                                                                           \
                                                                              \
  uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride,   \
                                    const uint8_t *second_pred) {             \
    const uint16x8_t abs =                                                    \
        sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
  }

/* Instantiate the 16-wide SAD kernels for 16x8, 16x16 and 16x32 blocks. */
sad16xN(8);
sad16xN(16);
sad16xN(32);
    189 
    190 static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
    191                                 const uint8_t *ref_ptr, int ref_stride,
    192                                 const int height) {
    193   int i;
    194   uint16x8_t abs = vdupq_n_u16(0);
    195 
    196   for (i = 0; i < height; ++i) {
    197     const uint8x16_t a_lo = vld1q_u8(src_ptr);
    198     const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
    199     const uint8x16_t b_lo = vld1q_u8(ref_ptr);
    200     const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
    201     src_ptr += src_stride;
    202     ref_ptr += ref_stride;
    203     abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo));
    204     abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo));
    205     abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi));
    206     abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi));
    207   }
    208   return abs;
    209 }
    210 
    211 static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
    212                                     const uint8_t *ref_ptr, int ref_stride,
    213                                     const uint8_t *second_pred,
    214                                     const int height) {
    215   int i;
    216   uint16x8_t abs = vdupq_n_u16(0);
    217 
    218   for (i = 0; i < height; ++i) {
    219     const uint8x16_t a_lo = vld1q_u8(src_ptr);
    220     const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
    221     const uint8x16_t b_lo = vld1q_u8(ref_ptr);
    222     const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
    223     const uint8x16_t c_lo = vld1q_u8(second_pred);
    224     const uint8x16_t c_hi = vld1q_u8(second_pred + 16);
    225     const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo);
    226     const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi);
    227     src_ptr += src_stride;
    228     ref_ptr += ref_stride;
    229     second_pred += 32;
    230     abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo));
    231     abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo));
    232     abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi));
    233     abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi));
    234   }
    235   return abs;
    236 }
    237 
/* Defines vpx_sad32x<n>_neon() and vpx_sad32x<n>_avg_neon(): each runs the
 * shared 32-wide row loop above for n rows, then folds the eight 16-bit
 * per-lane sums into the scalar SAD that is returned. */
#define sad32xN(n)                                                            \
  uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
                                const uint8_t *ref_ptr, int ref_stride) {     \
    const uint16x8_t abs =                                                    \
        sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
  }                                                                           \
                                                                              \
  uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride,   \
                                    const uint8_t *second_pred) {             \
    const uint16x8_t abs =                                                    \
        sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
  }

/* Instantiate the 32-wide SAD kernels for 32x16, 32x32 and 32x64 blocks. */
sad32xN(16);
sad32xN(32);
sad32xN(64);
    257 
    258 static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
    259                                 const uint8_t *ref_ptr, int ref_stride,
    260                                 const int height) {
    261   int i;
    262   uint16x8_t abs_0 = vdupq_n_u16(0);
    263   uint16x8_t abs_1 = vdupq_n_u16(0);
    264 
    265   for (i = 0; i < height; ++i) {
    266     const uint8x16_t a_0 = vld1q_u8(src_ptr);
    267     const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
    268     const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
    269     const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
    270     const uint8x16_t b_0 = vld1q_u8(ref_ptr);
    271     const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
    272     const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
    273     const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
    274     src_ptr += src_stride;
    275     ref_ptr += ref_stride;
    276     abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0));
    277     abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0));
    278     abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1));
    279     abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1));
    280     abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2));
    281     abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2));
    282     abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3));
    283     abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3));
    284   }
    285 
    286   {
    287     const uint32x4_t sum = vpaddlq_u16(abs_0);
    288     return vpadalq_u16(sum, abs_1);
    289   }
    290 }
    291 
    292 static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
    293                                     const uint8_t *ref_ptr, int ref_stride,
    294                                     const uint8_t *second_pred,
    295                                     const int height) {
    296   int i;
    297   uint16x8_t abs_0 = vdupq_n_u16(0);
    298   uint16x8_t abs_1 = vdupq_n_u16(0);
    299 
    300   for (i = 0; i < height; ++i) {
    301     const uint8x16_t a_0 = vld1q_u8(src_ptr);
    302     const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
    303     const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
    304     const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
    305     const uint8x16_t b_0 = vld1q_u8(ref_ptr);
    306     const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
    307     const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
    308     const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
    309     const uint8x16_t c_0 = vld1q_u8(second_pred);
    310     const uint8x16_t c_1 = vld1q_u8(second_pred + 16);
    311     const uint8x16_t c_2 = vld1q_u8(second_pred + 32);
    312     const uint8x16_t c_3 = vld1q_u8(second_pred + 48);
    313     const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);
    314     const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);
    315     const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);
    316     const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);
    317     src_ptr += src_stride;
    318     ref_ptr += ref_stride;
    319     second_pred += 64;
    320     abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0));
    321     abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0));
    322     abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1));
    323     abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1));
    324     abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2));
    325     abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2));
    326     abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3));
    327     abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3));
    328   }
    329 
    330   {
    331     const uint32x4_t sum = vpaddlq_u16(abs_0);
    332     return vpadalq_u16(sum, abs_1);
    333   }
    334 }
    335 
/* Defines vpx_sad64x<n>_neon() and vpx_sad64x<n>_avg_neon(): each runs the
 * shared 64-wide row loop above for n rows, then folds the four 32-bit
 * per-lane sums into the scalar SAD that is returned. */
#define sad64xN(n)                                                            \
  uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
                                const uint8_t *ref_ptr, int ref_stride) {     \
    const uint32x4_t abs =                                                    \
        sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \
  }                                                                           \
                                                                              \
  uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride,   \
                                    const uint8_t *second_pred) {             \
    const uint32x4_t abs =                                                    \
        sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \
  }

/* Instantiate the 64-wide SAD kernels for 64x32 and 64x64 blocks. */
sad64xN(32);
sad64xN(64);
    354