/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

/* Load four 8-byte rows starting at s, consecutive rows p bytes apart. */
static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s + 0 * p);
  *s1 = vld1_u8(s + 1 * p);
  *s2 = vld1_u8(s + 2 * p);
  *s3 = vld1_u8(s + 3 * p);
}

/* Load eight 8-byte rows starting at s, consecutive rows p bytes apart. */
static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s + 0 * p);
  *s1 = vld1_u8(s + 1 * p);
  *s2 = vld1_u8(s + 2 * p);
  *s3 = vld1_u8(s + 3 * p);
  *s4 = vld1_u8(s + 4 * p);
  *s5 = vld1_u8(s + 5 * p);
  *s6 = vld1_u8(s + 6 * p);
  *s7 = vld1_u8(s + 7 * p);
}

/* Load eight 16-byte rows starting at s, consecutive rows p bytes apart. */
static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s + 0 * p);
  *s1 = vld1q_u8(s + 1 * p);
  *s2 = vld1q_u8(s + 2 * p);
  *s3 = vld1q_u8(s + 3 * p);
  *s4 = vld1q_u8(s + 4 * p);
  *s5 = vld1q_u8(s + 5 * p);
  *s6 = vld1q_u8(s + 6 * p);
  *s7 = vld1q_u8(s + 7 * p);
}

/*
 * Apply the 8-tap filter in |filters| to four lanes of samples s0..s7.
 * Taps 0-2 and 5-7 are accumulated with plain (wrapping) multiply-adds;
 * the tap-3 and tap-4 products are folded in last with saturating adds
 * (|filter3| and |filter4| carry those two taps pre-broadcast).
 * Returns the raw 16-bit accumulation — rounding and narrowing are left
 * to the caller.
 */
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                    const int16x4_t s2, const int16x4_t s3,
                                    const int16x4_t s4, const int16x4_t s5,
                                    const int16x4_t s6, const int16x4_t s7,
                                    const int16x8_t filters,
                                    const int16x4_t filter3,
                                    const int16x4_t filter4) {
  const int16x4_t taps_lo = vget_low_s16(filters);
  const int16x4_t taps_hi = vget_high_s16(filters);
  const int16x4_t prod3 = vmul_s16(s3, filter3);
  const int16x4_t prod4 = vmul_s16(s4, filter4);
  int16x4_t acc;

  acc = vmul_lane_s16(s0, taps_lo, 0);
  acc = vmla_lane_s16(acc, s1, taps_lo, 1);
  acc = vmla_lane_s16(acc, s2, taps_lo, 2);
  acc = vmla_lane_s16(acc, s5, taps_hi, 1);
  acc = vmla_lane_s16(acc, s6, taps_hi, 2);
  acc = vmla_lane_s16(acc, s7, taps_hi, 3);
  /* Saturating adds must stay last and in this order. */
  acc = vqadd_s16(acc, prod3);
  acc = vqadd_s16(acc, prod4);
  return acc;
}

/*
 * Apply the 8-tap filter in |filters| to eight lanes of samples s0..s7.
 * Same accumulation scheme as convolve8_4, but this variant also rounds,
 * shifts by 7 (the filter precision) and narrows to unsigned bytes with
 * saturation.
 */
static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
                                    const int16x8_t s2, const int16x8_t s3,
                                    const int16x8_t s4, const int16x8_t s5,
                                    const int16x8_t s6, const int16x8_t s7,
                                    const int16x8_t filters,
                                    const int16x8_t filter3,
                                    const int16x8_t filter4) {
  const int16x4_t taps_lo = vget_low_s16(filters);
  const int16x4_t taps_hi = vget_high_s16(filters);
  const int16x8_t prod3 = vmulq_s16(s3, filter3);
  const int16x8_t prod4 = vmulq_s16(s4, filter4);
  int16x8_t acc;

  acc = vmulq_lane_s16(s0, taps_lo, 0);
  acc = vmlaq_lane_s16(acc, s1, taps_lo, 1);
  acc = vmlaq_lane_s16(acc, s2, taps_lo, 2);
  acc = vmlaq_lane_s16(acc, s5, taps_hi, 1);
  acc = vmlaq_lane_s16(acc, s6, taps_hi, 2);
  acc = vmlaq_lane_s16(acc, s7, taps_hi, 3);
  /* Saturating adds must stay last and in this order. */
  acc = vqaddq_s16(acc, prod3);
  acc = vqaddq_s16(acc, prod4);
  return vqrshrun_n_s16(acc, 7);
}

/*
 * Widen the eight 8-pixel rows in s[0..7] to signed 16-bit and run the
 * 8-tap filter across them, yielding one filtered 8-pixel row. The
 * tap-3/tap-4 broadcasts expected by convolve8_8 are built here from
 * lanes 3 and 4 of |filters|.
 */
static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
                                       const int16x8_t filters) {
  const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
  const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
  int16x8_t ss[8];
  int i;

  for (i = 0; i < 8; ++i) {
    ss[i] = vreinterpretq_s16_u16(vmovl_u8(s[i]));
  }

  return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
                     filters, filter3, filter4);
}