Home | History | Annotate | Download | only in arm
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 #include "./vpx_config.h"
     14 #include "./vpx_dsp_rtcd.h"
     15 
     16 static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
     17                                uint8x8_t *const s0, uint8x8_t *const s1,
     18                                uint8x8_t *const s2, uint8x8_t *const s3) {
     19   *s0 = vld1_u8(s);
     20   s += p;
     21   *s1 = vld1_u8(s);
     22   s += p;
     23   *s2 = vld1_u8(s);
     24   s += p;
     25   *s3 = vld1_u8(s);
     26 }
     27 
     28 static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
     29                                uint8x8_t *const s0, uint8x8_t *const s1,
     30                                uint8x8_t *const s2, uint8x8_t *const s3,
     31                                uint8x8_t *const s4, uint8x8_t *const s5,
     32                                uint8x8_t *const s6, uint8x8_t *const s7) {
     33   *s0 = vld1_u8(s);
     34   s += p;
     35   *s1 = vld1_u8(s);
     36   s += p;
     37   *s2 = vld1_u8(s);
     38   s += p;
     39   *s3 = vld1_u8(s);
     40   s += p;
     41   *s4 = vld1_u8(s);
     42   s += p;
     43   *s5 = vld1_u8(s);
     44   s += p;
     45   *s6 = vld1_u8(s);
     46   s += p;
     47   *s7 = vld1_u8(s);
     48 }
     49 
     50 static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
     51                                 uint8x16_t *const s0, uint8x16_t *const s1,
     52                                 uint8x16_t *const s2, uint8x16_t *const s3,
     53                                 uint8x16_t *const s4, uint8x16_t *const s5,
     54                                 uint8x16_t *const s6, uint8x16_t *const s7) {
     55   *s0 = vld1q_u8(s);
     56   s += p;
     57   *s1 = vld1q_u8(s);
     58   s += p;
     59   *s2 = vld1q_u8(s);
     60   s += p;
     61   *s3 = vld1q_u8(s);
     62   s += p;
     63   *s4 = vld1q_u8(s);
     64   s += p;
     65   *s5 = vld1q_u8(s);
     66   s += p;
     67   *s6 = vld1q_u8(s);
     68   s += p;
     69   *s7 = vld1q_u8(s);
     70 }
     71 
     72 static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
     73                                     const int16x4_t s2, const int16x4_t s3,
     74                                     const int16x4_t s4, const int16x4_t s5,
     75                                     const int16x4_t s6, const int16x4_t s7,
     76                                     const int16x8_t filters,
     77                                     const int16x4_t filter3,
     78                                     const int16x4_t filter4) {
     79   const int16x4_t filters_lo = vget_low_s16(filters);
     80   const int16x4_t filters_hi = vget_high_s16(filters);
     81   int16x4_t sum;
     82 
     83   sum = vmul_lane_s16(s0, filters_lo, 0);
     84   sum = vmla_lane_s16(sum, s1, filters_lo, 1);
     85   sum = vmla_lane_s16(sum, s2, filters_lo, 2);
     86   sum = vmla_lane_s16(sum, s5, filters_hi, 1);
     87   sum = vmla_lane_s16(sum, s6, filters_hi, 2);
     88   sum = vmla_lane_s16(sum, s7, filters_hi, 3);
     89   sum = vqadd_s16(sum, vmul_s16(s3, filter3));
     90   sum = vqadd_s16(sum, vmul_s16(s4, filter4));
     91   return sum;
     92 }
     93 
     94 static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
     95                                     const int16x8_t s2, const int16x8_t s3,
     96                                     const int16x8_t s4, const int16x8_t s5,
     97                                     const int16x8_t s6, const int16x8_t s7,
     98                                     const int16x8_t filters,
     99                                     const int16x8_t filter3,
    100                                     const int16x8_t filter4) {
    101   const int16x4_t filters_lo = vget_low_s16(filters);
    102   const int16x4_t filters_hi = vget_high_s16(filters);
    103   int16x8_t sum;
    104 
    105   sum = vmulq_lane_s16(s0, filters_lo, 0);
    106   sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
    107   sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
    108   sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
    109   sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
    110   sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
    111   sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
    112   sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
    113   return vqrshrun_n_s16(sum, 7);
    114 }
    115 
    116 static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
    117                                        const int16x8_t filters) {
    118   const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    119   const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    120   int16x8_t ss[8];
    121 
    122   ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
    123   ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
    124   ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
    125   ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
    126   ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4]));
    127   ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5]));
    128   ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6]));
    129   ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
    130 
    131   return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
    132                      filters, filter3, filter4);
    133 }
    134