/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

// Reduce the two uint16x8_t partial-sum accumulators to a single scalar sum:
// add the low and high halves of each with widening to 32 bits, combine the
// two results, then fold down to one lane with pairwise long additions.
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

// Calculate the absolute differences of 64 bytes from vec_src_00, vec_src_16,
// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
// and vec_sum_ref_hi.
static void sad_neon_64(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8x16_t vec_src_32,
                        const uint8x16_t vec_src_48, const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);

  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
                             vget_low_u8(vec_ref_00));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
                             vget_high_u8(vec_ref_00));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
                             vget_low_u8(vec_ref_16));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
                             vget_high_u8(vec_ref_16));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
                             vget_low_u8(vec_ref_32));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
                             vget_high_u8(vec_ref_32));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
                             vget_low_u8(vec_ref_48));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
                             vget_high_u8(vec_ref_48));
}
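// For reference, a scalar model of what one sad_neon_64 call contributes: the
// 16 uint16 lanes together accumulate the plain sum of absolute differences
// over one 64-byte row. This sketch is illustrative only (the function name
// is hypothetical) and is excluded from compilation.
#if 0
static unsigned int row_sad_64_scalar(const uint8_t *src, const uint8_t *ref) {
  unsigned int sad = 0;
  int j;
  for (j = 0; j < 64; ++j) {
    const int diff = (int)src[j] - (int)ref[j];
    sad += (unsigned int)(diff >= 0 ? diff : -diff);
  }
  return sad;
}
#endif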
// Calculate the absolute differences of 32 bytes from vec_src_00, vec_src_16,
// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
static void sad_neon_32(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16, const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);

  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
                             vget_low_u8(vec_ref_00));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
                             vget_high_u8(vec_ref_00));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
                             vget_low_u8(vec_ref_16));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
                             vget_high_u8(vec_ref_16));
}

// The 16-bit accumulator lanes cannot overflow: each lane receives at most 4
// absolute byte differences (each <= 255) per row over 64 rows, for a maximum
// of 4 * 64 * 255 = 65280 < 65535.
void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);

    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
                &vec_sum_ref3_lo, &vec_sum_ref3_hi);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}
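// The 32x32 and 16x16 variants below follow the same accumulate-then-reduce
// pattern with even more headroom per uint16 lane: at most 2 * 32 * 255 =
// 16320 and 1 * 16 * 255 = 4080, respectively.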
void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);

    sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
                &vec_sum_ref0_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
                &vec_sum_ref1_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
                &vec_sum_ref2_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
                &vec_sum_ref3_hi);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}

void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
    const uint8x16_t vec_ref3 = vld1q_u8(ref3);

    vec_sum_ref0_lo =
        vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref0));
    vec_sum_ref1_lo =
        vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref1));
    vec_sum_ref2_lo =
        vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref2));
    vec_sum_ref3_lo =
        vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref3));

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}
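// Usage sketch (illustrative only; buffer names and contents are made up, and
// the block is excluded from compilation). One call computes the SAD of a
// single 16x16 source block against four candidate reference blocks; the
// strides equal the block width here because the blocks are contiguous.
#if 0
static void example_sad16x16x4d(void) {
  uint8_t src[16 * 16];
  uint8_t ref_mem[4][16 * 16];
  const uint8_t *ref[4] = { ref_mem[0], ref_mem[1], ref_mem[2], ref_mem[3] };
  uint32_t sad[4];
  int i;
  for (i = 0; i < 16 * 16; ++i) {
    src[i] = 0;
    ref_mem[0][i] = ref_mem[1][i] = ref_mem[2][i] = ref_mem[3][i] = 255;
  }
  vpx_sad16x16x4d_neon(src, 16, ref, 16, sad);
  // With these inputs each result equals 16 * 16 * 255 = 65280.
}
#endif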