/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"

// Sum of absolute differences (SAD) of a 4x4 block. All four 4-byte rows of
// each block are gathered into a single 16-byte vector, so one vabdl/vabal
// pair processes the entire block.
uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride) {
  const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
  const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
  uint16x8_t sad = vabdl_u8(vget_low_u8(s), vget_low_u8(r));
  sad = vabal_u8(sad, vget_high_u8(s), vget_high_u8(r));
  return vget_lane_u32(horizontal_add_uint16x8(sad), 0);
}

// 4x4 SAD against the rounding average of ref and second_pred (used for
// compound prediction). second_pred is a contiguous 4x4 block.
uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             const uint8_t *second_pred) {
  const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
  const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
  const uint8x16_t p = vld1q_u8(second_pred);
  const uint8x16_t pred = vrhaddq_u8(r, p);
  uint16x8_t sad = vabdl_u8(vget_low_u8(s), vget_low_u8(pred));
  sad = vabal_u8(sad, vget_high_u8(s), vget_high_u8(pred));
  return vget_lane_u32(horizontal_add_uint16x8(sad), 0);
}

// 4x8 SAD: two passes, each packing four 4-byte rows into one q register.
uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride) {
  int i;
  uint16x8_t sad = vdupq_n_u16(0);
  for (i = 0; i < 2; ++i) {
    const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
    const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
    src_ptr += 4 * src_stride;
    ref_ptr += 4 * ref_stride;
    sad = vabal_u8(sad, vget_low_u8(s), vget_low_u8(r));
    sad = vabal_u8(sad, vget_high_u8(s), vget_high_u8(r));
  }

  return vget_lane_u32(horizontal_add_uint16x8(sad), 0);
}

// 4x8 SAD against the rounding average of ref and second_pred.
uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             const uint8_t *second_pred) {
  int i;
  uint16x8_t sad = vdupq_n_u16(0);
  for (i = 0; i < 2; ++i) {
    const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
    const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
    const uint8x16_t p = vld1q_u8(second_pred);
    const uint8x16_t pred = vrhaddq_u8(r, p);
    src_ptr += 4 * src_stride;
    ref_ptr += 4 * ref_stride;
    second_pred += 16;
    sad = vabal_u8(sad, vget_low_u8(s), vget_low_u8(pred));
    sad = vabal_u8(sad, vget_high_u8(s), vget_high_u8(pred));
  }

  return vget_lane_u32(horizontal_add_uint16x8(sad), 0);
}

// Accumulate per-lane absolute differences over an 8-wide block of the given
// height. The caller reduces the returned vector to a scalar.
static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *ref_ptr, int ref_stride,
                               const int height) {
  int i;
  uint16x8_t sad = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sad = vabal_u8(sad, s, r);
  }
  return sad;
}

// 8-wide SAD accumulation against the rounding average of ref and
// second_pred (one contiguous 8-byte row per line of the block).
static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   const uint8_t *second_pred,
                                   const int height) {
  int i;
  uint16x8_t sad = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);
    const uint8x8_t p = vld1_u8(second_pred);
    const uint8x8_t pred = vrhadd_u8(r, p);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 8;
    sad = vabal_u8(sad, s, pred);
  }
  return sad;
}

// Instantiate the plain and compound-average 8xN entry points.
#define sad8xN(n)                                                             \
  uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride,        \
                               const uint8_t *ref_ptr, int ref_stride) {      \
    const uint16x8_t sad =                                                    \
        sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n);                   \
    return vget_lane_u32(horizontal_add_uint16x8(sad), 0);                    \
  }                                                                           \
                                                                              \
  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,    \
                                   const uint8_t *ref_ptr, int ref_stride,    \
                                   const uint8_t *second_pred) {              \
    const uint16x8_t sad =                                                    \
        sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n);  \
    return vget_lane_u32(horizontal_add_uint16x8(sad), 0);                    \
  }

sad8xN(4);
sad8xN(8);
sad8xN(16);

// Accumulate per-lane absolute differences over a 16-wide block.
static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *ref_ptr, int ref_stride,
                                const int height) {
  int i;
  uint16x8_t sad = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x16_t s = vld1q_u8(src_ptr);
    const uint8x16_t r = vld1q_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sad = vabal_u8(sad, vget_low_u8(s), vget_low_u8(r));
    sad = vabal_u8(sad, vget_high_u8(s), vget_high_u8(r));
  }
  return sad;
}

// 16-wide SAD accumulation against the rounding average of ref and
// second_pred.
static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    const uint8_t *second_pred,
                                    const int height) {
  int i;
  uint16x8_t sad = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x16_t s = vld1q_u8(src_ptr);
    const uint8x16_t r = vld1q_u8(ref_ptr);
    const uint8x16_t p = vld1q_u8(second_pred);
    const uint8x16_t pred = vrhaddq_u8(r, p);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16;
    sad = vabal_u8(sad, vget_low_u8(s), vget_low_u8(pred));
    sad = vabal_u8(sad, vget_high_u8(s), vget_high_u8(pred));
  }
  return sad;
}

// Instantiate the plain and compound-average 16xN entry points.
#define sad16xN(n)                                                            \
  uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
                                const uint8_t *ref_ptr, int ref_stride) {     \
    const uint16x8_t sad =                                                    \
        sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
    return vget_lane_u32(horizontal_add_uint16x8(sad), 0);                    \
  }                                                                           \
                                                                              \
  uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride,   \
                                    const uint8_t *second_pred) {             \
    const uint16x8_t sad =                                                    \
        sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
    return vget_lane_u32(horizontal_add_uint16x8(sad), 0);                    \
  }

sad16xN(8);
sad16xN(16);
sad16xN(32);

// Accumulate per-lane absolute differences over a 32-wide block, loaded as
// two q registers per row. One 16-bit accumulator suffices: each lane gains
// at most 4 * 255 per row, which stays below 65536 for heights up to 64.
static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *ref_ptr, int ref_stride,
                                const int height) {
  int i;
  uint16x8_t sad = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x16_t s0 = vld1q_u8(src_ptr);
    const uint8x16_t s1 = vld1q_u8(src_ptr + 16);
    const uint8x16_t r0 = vld1q_u8(ref_ptr);
    const uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sad = vabal_u8(sad, vget_low_u8(s0), vget_low_u8(r0));
    sad = vabal_u8(sad, vget_high_u8(s0), vget_high_u8(r0));
    sad = vabal_u8(sad, vget_low_u8(s1), vget_low_u8(r1));
    sad = vabal_u8(sad, vget_high_u8(s1), vget_high_u8(r1));
  }
  return sad;
}

// 32-wide SAD accumulation against the rounding average of ref and
// second_pred.
static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    const uint8_t *second_pred,
                                    const int height) {
  int i;
  uint16x8_t sad = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x16_t s0 = vld1q_u8(src_ptr);
    const uint8x16_t s1 = vld1q_u8(src_ptr + 16);
    const uint8x16_t r0 = vld1q_u8(ref_ptr);
    const uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
    const uint8x16_t p0 = vld1q_u8(second_pred);
    const uint8x16_t p1 = vld1q_u8(second_pred + 16);
    const uint8x16_t pred0 = vrhaddq_u8(r0, p0);
    const uint8x16_t pred1 = vrhaddq_u8(r1, p1);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 32;
    sad = vabal_u8(sad, vget_low_u8(s0), vget_low_u8(pred0));
    sad = vabal_u8(sad, vget_high_u8(s0), vget_high_u8(pred0));
    sad = vabal_u8(sad, vget_low_u8(s1), vget_low_u8(pred1));
    sad = vabal_u8(sad, vget_high_u8(s1), vget_high_u8(pred1));
  }
  return sad;
}

// Instantiate the plain and compound-average 32xN entry points.
#define sad32xN(n)                                                            \
  uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
                                const uint8_t *ref_ptr, int ref_stride) {     \
    const uint16x8_t sad =                                                    \
        sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
    return vget_lane_u32(horizontal_add_uint16x8(sad), 0);                    \
  }                                                                           \
                                                                              \
  uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride,   \
                                    const uint8_t *second_pred) {             \
    const uint16x8_t sad =                                                    \
        sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
    return vget_lane_u32(horizontal_add_uint16x8(sad), 0);                    \
  }

sad32xN(16);
sad32xN(32);
sad32xN(64);

// Accumulate absolute differences over a 64-wide block. Columns 0-31 and
// 32-63 use separate 16-bit accumulators so no lane can exceed 65535 (at
// most 4 * 255 per row per lane, times a height of at most 64); the pair is
// widened and merged into a single 32-bit vector at the end.
static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *ref_ptr, int ref_stride,
                                const int height) {
  int i;
  uint16x8_t sum0 = vdupq_n_u16(0);
  uint16x8_t sum1 = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x16_t s0 = vld1q_u8(src_ptr);
    const uint8x16_t s1 = vld1q_u8(src_ptr + 16);
    const uint8x16_t s2 = vld1q_u8(src_ptr + 32);
    const uint8x16_t s3 = vld1q_u8(src_ptr + 48);
    const uint8x16_t r0 = vld1q_u8(ref_ptr);
    const uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
    const uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
    const uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sum0 = vabal_u8(sum0, vget_low_u8(s0), vget_low_u8(r0));
    sum0 = vabal_u8(sum0, vget_high_u8(s0), vget_high_u8(r0));
    sum0 = vabal_u8(sum0, vget_low_u8(s1), vget_low_u8(r1));
    sum0 = vabal_u8(sum0, vget_high_u8(s1), vget_high_u8(r1));
    sum1 = vabal_u8(sum1, vget_low_u8(s2), vget_low_u8(r2));
    sum1 = vabal_u8(sum1, vget_high_u8(s2), vget_high_u8(r2));
    sum1 = vabal_u8(sum1, vget_low_u8(s3), vget_low_u8(r3));
    sum1 = vabal_u8(sum1, vget_high_u8(s3), vget_high_u8(r3));
  }

  {
    const uint32x4_t sum = vpaddlq_u16(sum0);
    return vpadalq_u16(sum, sum1);
  }
}

// 64-wide SAD accumulation against the rounding average of ref and
// second_pred, with the same split-accumulator scheme as sad64x.
static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    const uint8_t *second_pred,
                                    const int height) {
  int i;
  uint16x8_t sum0 = vdupq_n_u16(0);
  uint16x8_t sum1 = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x16_t s0 = vld1q_u8(src_ptr);
    const uint8x16_t s1 = vld1q_u8(src_ptr + 16);
    const uint8x16_t s2 = vld1q_u8(src_ptr + 32);
    const uint8x16_t s3 = vld1q_u8(src_ptr + 48);
    const uint8x16_t r0 = vld1q_u8(ref_ptr);
    const uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
    const uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
    const uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
    const uint8x16_t p0 = vld1q_u8(second_pred);
    const uint8x16_t p1 = vld1q_u8(second_pred + 16);
    const uint8x16_t p2 = vld1q_u8(second_pred + 32);
    const uint8x16_t p3 = vld1q_u8(second_pred + 48);
    const uint8x16_t pred0 = vrhaddq_u8(r0, p0);
    const uint8x16_t pred1 = vrhaddq_u8(r1, p1);
    const uint8x16_t pred2 = vrhaddq_u8(r2, p2);
    const uint8x16_t pred3 = vrhaddq_u8(r3, p3);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 64;
    sum0 = vabal_u8(sum0, vget_low_u8(s0), vget_low_u8(pred0));
    sum0 = vabal_u8(sum0, vget_high_u8(s0), vget_high_u8(pred0));
    sum0 = vabal_u8(sum0, vget_low_u8(s1), vget_low_u8(pred1));
    sum0 = vabal_u8(sum0, vget_high_u8(s1), vget_high_u8(pred1));
    sum1 = vabal_u8(sum1, vget_low_u8(s2), vget_low_u8(pred2));
    sum1 = vabal_u8(sum1, vget_high_u8(s2), vget_high_u8(pred2));
    sum1 = vabal_u8(sum1, vget_low_u8(s3), vget_low_u8(pred3));
    sum1 = vabal_u8(sum1, vget_high_u8(s3), vget_high_u8(pred3));
  }

  {
    const uint32x4_t sum = vpaddlq_u16(sum0);
    return vpadalq_u16(sum, sum1);
  }
}

// Instantiate the plain and compound-average 64xN entry points. The 64-wide
// helpers already return 32-bit lanes, so the reduction differs from the
// narrower sizes.
#define sad64xN(n)                                                            \
  uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
                                const uint8_t *ref_ptr, int ref_stride) {     \
    const uint32x4_t sad =                                                    \
        sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
    return vget_lane_u32(horizontal_add_uint32x4(sad), 0);                    \
  }                                                                           \
                                                                              \
  uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride,   \
                                    const uint8_t *second_pred) {             \
    const uint32x4_t sad =                                                    \
        sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
    return vget_lane_u32(horizontal_add_uint32x4(sad), 0);                    \
  }

sad64xN(32);
sad64xN(64);