/*
 *
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <arm_neon.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/arm/convolve_neon.h"
#include "av1/common/arm/mem_neon.h"
#include "av1/common/arm/transpose_neon.h"

static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x4_t s6, const int16x4_t s7,
                                      const int16_t *filter) {
  int16x4_t sum;

  sum = vmul_n_s16(s0, filter[0]);
  sum = vmla_n_s16(sum, s1, filter[1]);
  sum = vmla_n_s16(sum, s2, filter[2]);
  sum = vmla_n_s16(sum, s5, filter[5]);
  sum = vmla_n_s16(sum, s6, filter[6]);
  sum = vmla_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 plus the accumulated
   * sum can exceed 16 bits; use saturating adds for these two taps.
   */
  sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
  sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));

  return sum;
}

static INLINE uint8x8_t convolve8_horiz_8x8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
    const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
  int16x8_t sum;

  sum = vmulq_n_s16(s0, filter[0]);
  sum = vmlaq_n_s16(sum, s1, filter[1]);
  sum = vmlaq_n_s16(sum, s2, filter[2]);
  sum = vmlaq_n_s16(sum, s5, filter[5]);
  sum = vmlaq_n_s16(sum, s6, filter[6]);
  sum = vmlaq_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 plus the accumulated
   * sum can exceed 16 bits; use saturating adds for these two taps.
   */
  sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
  sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));

  sum = vqrshlq_s16(sum, shift_round_0);
  sum = vqrshlq_s16(sum, shift_by_bits);

  return vqmovun_s16(sum);
}

#if !defined(__aarch64__)
static INLINE uint8x8_t convolve8_horiz_4x1(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
    const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
  int16x4_t sum;

  sum = vmul_n_s16(s0, filter[0]);
  sum = vmla_n_s16(sum, s1, filter[1]);
  sum = vmla_n_s16(sum, s2, filter[2]);
  sum = vmla_n_s16(sum, s5, filter[5]);
  sum = vmla_n_s16(sum, s6, filter[6]);
  sum = vmla_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 plus the accumulated
   * sum can exceed 16 bits; use saturating adds for these two taps.
   */
  sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
  sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));

  sum = vqrshl_s16(sum, shift_round_0);
  sum = vqrshl_s16(sum, shift_by_bits);

  return vqmovun_s16(vcombine_s16(sum, sum));
}
#endif  // !defined(__aarch64__)

static INLINE uint8x8_t convolve8_vert_8x4(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
  int16x8_t sum;

  sum = vmulq_n_s16(s0, filter[0]);
  sum = vmlaq_n_s16(sum, s1, filter[1]);
  sum = vmlaq_n_s16(sum, s2, filter[2]);
  sum = vmlaq_n_s16(sum, s5, filter[5]);
  sum = vmlaq_n_s16(sum, s6, filter[6]);
  sum = vmlaq_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 plus the accumulated
   * sum can exceed 16 bits; use saturating adds for these two taps.
   */
  sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
  sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));

  return vqrshrun_n_s16(sum, FILTER_BITS);
}

static INLINE uint16x4_t convolve8_vert_4x4_s32(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
    const int32x4_t round_shift_vec, const int32x4_t offset_const,
    const int32x4_t sub_const_vec) {
  int32x4_t sum0;
  uint16x4_t res;
  const int32x4_t zero = vdupq_n_s32(0);

  sum0 = vmull_n_s16(s0, y_filter[0]);
  sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
  sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
  sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
  sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
  sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
  sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
  sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);

  sum0 = vaddq_s32(sum0, offset_const);
  sum0 = vqrshlq_s32(sum0, round_shift_vec);
  sum0 = vsubq_s32(sum0, sub_const_vec);
  sum0 = vmaxq_s32(sum0, zero);

  res = vmovn_u32(vreinterpretq_u32_s32(sum0));

  return res;
}

static INLINE uint8x8_t convolve8_vert_8x4_s32(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
    const int32x4_t round_shift_vec, const int32x4_t offset_const,
    const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
  int32x4_t sum0, sum1;
  uint16x8_t res;
  const int32x4_t zero = vdupq_n_s32(0);

  sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);

  sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);

  sum0 = vaddq_s32(sum0, offset_const);
  sum1 = vaddq_s32(sum1, offset_const);
  sum0 = vqrshlq_s32(sum0, round_shift_vec);
  sum1 = vqrshlq_s32(sum1, round_shift_vec);
  sum0 = vsubq_s32(sum0, sub_const_vec);
  sum1 = vsubq_s32(sum1, sub_const_vec);
  sum0 = vmaxq_s32(sum0, zero);
  sum1 = vmaxq_s32(sum1, zero);
  res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
                     vqmovn_u32(vreinterpretq_u32_s32(sum1)));

  res = vqrshlq_u16(res, vec_round_bits);

  return vqmovn_u16(res);
}

void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_x_q4, const int subpel_y_q4,
                            ConvolveParams *conv_params) {
  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
  const int8_t bits = FILTER_BITS - conv_params->round_0;

  (void)subpel_y_q4;
  (void)conv_params;
  (void)filter_params_y;

  uint8x8_t t0;
#if defined(__aarch64__)
  uint8x8_t t1, t2, t3;
#endif

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);

  const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
  const int16x8_t shift_by_bits = vdupq_n_s16(-bits);

  src -= horiz_offset;
#if defined(__aarch64__)
  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
    int16x8_t d01_temp, d23_temp;

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);

    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);

    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);

      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);

      d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);

      d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);

      d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);

      d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0);
      d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0);

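      // shift_round_0 above applied the -round_0 rounding shift; shifting by
      // bits (FILTER_BITS - round_0) below completes the FILTER_BITS total
      // downshift.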
      d01_temp = vqrshlq_s16(d01_temp, shift_by_bits);
      d23_temp = vqrshlq_s16(d23_temp, shift_by_bits);

      d01 = vqmovun_s16(d01_temp);
      d23 = vqmovun_s16(d23_temp);

      transpose_u8_4x4(&d01, &d23);

      if (w != 2) {
        vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),  // 00 01 02 03
                      vreinterpret_u32_u8(d01), 0);
        vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),  // 10 11 12 13
                      vreinterpret_u32_u8(d23), 0);
        vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),  // 20 21 22 23
                      vreinterpret_u32_u8(d01), 1);
        vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),  // 30 31 32 33
                      vreinterpret_u32_u8(d23), 1);
      } else {
        vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride),  // 00 01
                      vreinterpret_u16_u8(d01), 0);
        vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride),  // 10 11
                      vreinterpret_u16_u8(d23), 0);
        vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride),  // 20 21
                      vreinterpret_u16_u8(d01), 2);
        vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride),  // 30 31
                      vreinterpret_u16_u8(d23), 2);
      }

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
#endif
    int width;
    const uint8_t *s;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

#if defined(__aarch64__)
    int16x8_t s8, s9, s10;
    uint8x8_t t4, t5, t6, t7;
#endif

    if (w <= 4) {
#if defined(__aarch64__)
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);

        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                 shift_round_0, shift_by_bits);
        t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                 shift_round_0, shift_by_bits);
        t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                 shift_round_0, shift_by_bits);
        t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                 shift_round_0, shift_by_bits);

        transpose_u8_8x4(&t0, &t1, &t2, &t3);

        if ((w == 4) && (h > 4)) {
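          // After transpose_u8_8x4, t0..t3 pack the eight output rows as
          // 32-bit lanes: lane 0 carries rows 0..3, lane 1 carries rows 4..7.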
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
                        0);  // 00 01 02 03
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
                        0);  // 10 11 12 13
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
                        0);  // 20 21 22 23
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
                        0);  // 30 31 32 33
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
                        1);  // 40 41 42 43
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
                        1);  // 50 51 52 53
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
                        1);  // 60 61 62 63
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
                        1);  // 70 71 72 73
          dst += dst_stride;
        } else if ((w == 4) && (h == 2)) {
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
                        0);  // 00 01 02 03
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
                        0);  // 10 11 12 13
          dst += dst_stride;
        } else if ((w == 2) && (h > 4)) {
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0);  // 20 21
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0);  // 30 31
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2);  // 40 41
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2);  // 50 51
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2);  // 60 61
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2);  // 70 71
          dst += dst_stride;
        } else if ((w == 2) && (h == 2)) {
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
          dst += dst_stride;
        }
        h -= 8;
      } while (h > 0);
#else
      int16x8_t tt0;
      int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
      const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
      const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
      do {
        t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
        tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        x0 = vget_low_s16(tt0);   // a0 a1 a2 a3
        x4 = vget_high_s16(tt0);  // a4 a5 a6 a7

        t0 = vld1_u8(src + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
        tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        x7 = vget_low_s16(tt0);  // a8 a9 a10 a11

        x1 = vext_s16(x0, x4, 1);  // a1 a2 a3 a4
        x2 = vext_s16(x0, x4, 2);  // a2 a3 a4 a5
        x3 = vext_s16(x0, x4, 3);  // a3 a4 a5 a6
        x5 = vext_s16(x4, x7, 1);  // a5 a6 a7 a8
        x6 = vext_s16(x4, x7, 2);  // a6 a7 a8 a9
        x7 = vext_s16(x4, x7, 3);  // a7 a8 a9 a10

        src += src_stride;

        t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
                                 shift_round_0_low, shift_by_bits_low);

        if (w == 4) {
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
                        0);  // 00 01 02 03
          dst += dst_stride;
        } else if (w == 2) {
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
          dst += dst_stride;
        }
        h -= 1;
      } while (h > 0);
#endif
    } else {
      uint8_t *d;
      int16x8_t s11;
#if defined(__aarch64__)
      int16x8_t s12, s13, s14;
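      // Walk the block in 8x8 tiles: the outer loop advances 8 rows, the
      // inner loop advances 8 columns, carrying the last 7 transposed
      // columns (s0..s6) over between inner iterations.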
      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                   shift_round_0, shift_by_bits);

          t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                   shift_round_0, shift_by_bits);

          t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                   shift_round_0, shift_by_bits);

          t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                   shift_round_0, shift_by_bits);

          t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
                                   shift_round_0, shift_by_bits);

          t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
                                   shift_round_0, shift_by_bits);

          t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
                                   shift_round_0, shift_by_bits);

          t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
                                   x_filter, shift_round_0, shift_by_bits);

          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          if (h != 2) {
            store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
          } else {
            store_row2_u8_8x8(d, dst_stride, t0, t1);
          }
          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
#else
      do {
        t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));

        width = w;
        s = src + 8;
        d = dst;
        __builtin_prefetch(dst);

        do {
          t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s11 = s0;
          s0 = s7;

          s1 = vextq_s16(s11, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
          s2 = vextq_s16(s11, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
          s3 = vextq_s16(s11, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
          s4 = vextq_s16(s11, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
          s5 = vextq_s16(s11, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
          s6 = vextq_s16(s11, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
          s7 = vextq_s16(s11, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

          t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                   shift_round_0, shift_by_bits);
          vst1_u8(d, t0);

          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += src_stride;
        dst += dst_stride;
        h -= 1;
      } while (h > 0);
#endif
    }
#if defined(__aarch64__)
  }
#endif
}

void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_x_q4, const int subpel_y_q4,
                            ConvolveParams *conv_params) {
  const int vert_offset = filter_params_y->taps / 2 - 1;

  src -= vert_offset * src_stride;

  (void)filter_params_x;
  (void)subpel_x_q4;
  (void)conv_params;

  assert(conv_params->round_0 <= FILTER_BITS);
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);

  if (w <= 4) {
    uint8x8_t d01;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
#if defined(__aarch64__)
    uint8x8_t d23;
    int16x4_t s8, s9, s10, d1, d2, d3;
#endif
    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
#if defined(__aarch64__)
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
      d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
      d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
      if ((w == 4) && (h != 2)) {
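        // d01 packs output rows 0 and 1 as 32-bit lanes; d23 packs rows 2
        // and 3.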
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
                      0);  // 00 01 02 03
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
                      1);  // 10 11 12 13
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
                      0);  // 20 21 22 23
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
                      1);  // 30 31 32 33
        dst += dst_stride;
      } else if ((w == 4) && (h == 2)) {
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
                      0);  // 00 01 02 03
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
                      1);  // 10 11 12 13
        dst += dst_stride;
      } else if ((w == 2) && (h != 2)) {
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
        dst += dst_stride;
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
        dst += dst_stride;
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0);  // 20 21
        dst += dst_stride;
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2);  // 30 31
        dst += dst_stride;
      } else if ((w == 2) && (h == 2)) {
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
        dst += dst_stride;
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
        dst += dst_stride;
      }
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
#else
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);

      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);

      if (w == 4) {
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
        dst += dst_stride;
      } else if (w == 2) {
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
        dst += dst_stride;
      }
      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      s5 = s6;
      s6 = s7;
      h -= 1;
#endif
    } while (h > 0);
  } else {
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
#if defined(__aarch64__)
    uint8x8_t t1, t2, t3;
    int16x8_t s8, s9, s10;
#endif
    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
#if defined(__aarch64__)
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
        t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
        t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
        t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
        if (h != 2) {
          vst1_u8(d, t0);
          d += dst_stride;
          vst1_u8(d, t1);
          d += dst_stride;
          vst1_u8(d, t2);
          d += dst_stride;
          vst1_u8(d, t3);
          d += dst_stride;
        } else {
          vst1_u8(d, t0);
          d += dst_stride;
          vst1_u8(d, t1);
          d += dst_stride;
        }
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
#else
        __builtin_prefetch(d);
        __builtin_prefetch(s);

        t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);

        vst1_u8(d, t0);
        d += dst_stride;

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        height -= 1;
#endif
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}

void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
  int im_dst_stride;
  int width, height;
  uint8x8_t t0;
#if defined(__aarch64__)
  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
#endif

  DECLARE_ALIGNED(16, int16_t,
                  im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);

  const int bd = 8;
  const int im_h = h + filter_params_y->taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = filter_params_y->taps / 2 - 1;
  const int horiz_offset = filter_params_x->taps / 2 - 1;

  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
  const uint8_t *s;
  int16_t *dst_ptr;

  dst_ptr = im_block;
  im_dst_stride = im_stride;
  height = im_h;
  width = w;

  const int16_t round_bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);

  int16_t x_filter_tmp[8];
  int16x8_t filter_x_coef = vld1q_s16(x_filter);

  // filter coeffs are even, so downshifting by 1 to reduce intermediate
  // precision requirements.
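  // Since the taps are even, halving them keeps exact integer coefficients;
  // the lost factor of 2 is compensated below by shifting with
  // (round_0 - 1) rather than round_0.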
  filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
  vst1q_s16(&x_filter_tmp[0], filter_x_coef);

  assert(conv_params->round_0 > 0);

  if (w <= 4) {
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
#if defined(__aarch64__)
    int16x4_t s8, s9, s10, d1, d2, d3;
#endif

    const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
    const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));

    do {
      s = src_ptr;

#if defined(__aarch64__)
      __builtin_prefetch(s + 0 * src_stride);
      __builtin_prefetch(s + 1 * src_stride);
      __builtin_prefetch(s + 2 * src_stride);
      __builtin_prefetch(s + 3 * src_stride);

      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);

      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
      s += 7;

      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);

      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
                             horiz_const, shift_round_0);
      d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
                             horiz_const, shift_round_0);
      d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
                             horiz_const, shift_round_0);
      d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
                             horiz_const, shift_round_0);

      transpose_s16_4x4d(&d0, &d1, &d2, &d3);
      if (w == 4) {
        vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
        vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
        vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
        vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
      } else if (w == 2) {
        vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
                      vreinterpret_u32_s16(d0), 0);
        vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
                      vreinterpret_u32_s16(d1), 0);
        vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
                      vreinterpret_u32_s16(d2), 0);
        vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
                      vreinterpret_u32_s16(d3), 0);
      }
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * im_dst_stride;
      height -= 4;
#else
      int16x8_t tt0;

      __builtin_prefetch(s);

      t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      s0 = vget_low_s16(tt0);
      s4 = vget_high_s16(tt0);

      __builtin_prefetch(dst_ptr);
      s += 8;

      t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));

      s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
      s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
      s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
      s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
      s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
      s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10

      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
                             horiz_const, shift_round_0);

      if (w == 4) {
        vst1_s16(dst_ptr, d0);
        dst_ptr += im_dst_stride;
      } else if (w == 2) {
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
        dst_ptr += im_dst_stride;
      }

      src_ptr += src_stride;
      height -= 1;
#endif
    } while (height > 0);
  } else {
    int16_t *d_tmp;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
#if defined(__aarch64__)
    int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
    int16x8_t s11, s12, s13, s14;
#endif

    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
    const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));

#if defined(__aarch64__)
    do {
      __builtin_prefetch(src_ptr + 0 * src_stride);
      __builtin_prefetch(src_ptr + 1 * src_stride);
      __builtin_prefetch(src_ptr + 2 * src_stride);
      __builtin_prefetch(src_ptr + 3 * src_stride);
      __builtin_prefetch(src_ptr + 4 * src_stride);
      __builtin_prefetch(src_ptr + 5 * src_stride);
      __builtin_prefetch(src_ptr + 6 * src_stride);
      __builtin_prefetch(src_ptr + 7 * src_stride);

      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      width = w;
      s = src_ptr + 7;
      d_tmp = dst_ptr;

      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 7 * im_dst_stride);

      do {
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
                                 x_filter_tmp, horiz_const, shift_round_0);
        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
                                 x_filter_tmp, horiz_const, shift_round_0);
        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
                                 x_filter_tmp, horiz_const, shift_round_0);

        transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
                          &res7);

        store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
                      res6, res7);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * im_dst_stride;
      height -= 8;
    } while (height > 0);
#else
    do {
      t0 = vld1_u8(src_ptr);
      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7

      width = w;
      s = src_ptr + 8;
      d_tmp = dst_ptr;

      __builtin_prefetch(dst_ptr);

      do {
        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t sum = s0;
        s0 = s7;

        s1 = vextq_s16(sum, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        s2 = vextq_s16(sum, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        s3 = vextq_s16(sum, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        s4 = vextq_s16(sum, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        s5 = vextq_s16(sum, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        s6 = vextq_s16(sum, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        s7 = vextq_s16(sum, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
                                 horiz_const, shift_round_0);

        vst1q_s16(d_tmp, res0);

        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += src_stride;
      dst_ptr += im_dst_stride;
      height -= 1;
    } while (height > 0);
#endif
  }

  // vertical
  {
    uint8_t *dst_u8_ptr, *d_u8;
    int16_t *v_src_ptr, *v_s;

    const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
                              (1 << (offset_bits - conv_params->round_1 - 1));
    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
        filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
    const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
    const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);

    src_stride = im_stride;
    v_src_ptr = im_block;
    dst_u8_ptr = dst;

    height = h;
    width = w;

    if (width <= 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
      uint16x4_t d0;
      uint16x8_t dd0;
      uint8x8_t d01;

#if defined(__aarch64__)
      int16x4_t s8, s9, s10;
      uint16x4_t d1, d2, d3;
      uint16x8_t dd1;
      uint8x8_t d23;
#endif

      d_u8 = dst_u8_ptr;
      v_s = v_src_ptr;

      __builtin_prefetch(v_s + 0 * im_stride);
      __builtin_prefetch(v_s + 1 * im_stride);
      __builtin_prefetch(v_s + 2 * im_stride);
      __builtin_prefetch(v_s + 3 * im_stride);
      __builtin_prefetch(v_s + 4 * im_stride);
      __builtin_prefetch(v_s + 5 * im_stride);
      __builtin_prefetch(v_s + 6 * im_stride);
      __builtin_prefetch(v_s + 7 * im_stride);

      load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
      v_s += (7 * im_stride);

      do {
#if defined(__aarch64__)
        load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
        v_s += (im_stride << 2);

        __builtin_prefetch(d_u8 + 0 * dst_stride);
        __builtin_prefetch(d_u8 + 1 * dst_stride);
        __builtin_prefetch(d_u8 + 2 * dst_stride);
        __builtin_prefetch(d_u8 + 3 * dst_stride);

        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);
        d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);
        d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);
        d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);

        dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
        dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);

        d01 = vqmovn_u16(dd0);
        d23 = vqmovn_u16(dd1);

        if ((w == 4) && (h != 2)) {
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        0);  // 00 01 02 03
          d_u8 += dst_stride;
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        1);  // 10 11 12 13
          d_u8 += dst_stride;
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
                        0);  // 20 21 22 23
          d_u8 += dst_stride;
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
                        1);  // 30 31 32 33
          d_u8 += dst_stride;
        } else if ((w == 2) && (h != 2)) {
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        0);  // 00 01
          d_u8 += dst_stride;
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        2);  // 10 11
          d_u8 += dst_stride;
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
                        0);  // 20 21
          d_u8 += dst_stride;
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
                        2);  // 30 31
          d_u8 += dst_stride;
        } else if ((w == 4) && (h == 2)) {
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        0);  // 00 01 02 03
          d_u8 += dst_stride;
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        1);  // 10 11 12 13
          d_u8 += dst_stride;
        } else if ((w == 2) && (h == 2)) {
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        0);  // 00 01
          d_u8 += dst_stride;
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        2);  // 10 11
          d_u8 += dst_stride;
        }

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
#else
        s7 = vld1_s16(v_s);
        v_s += im_stride;

        __builtin_prefetch(d_u8 + 0 * dst_stride);

        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);

        dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
        d01 = vqmovn_u16(dd0);

        if (w == 4) {
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        0);  // 00 01 02 03
          d_u8 += dst_stride;

        } else if (w == 2) {
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        0);  // 00 01
          d_u8 += dst_stride;
        }

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        height -= 1;
#endif
      } while (height > 0);
    } else {
      // if width is a multiple of 8 & height is a multiple of 4
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
      uint8x8_t res0;
#if defined(__aarch64__)
      int16x8_t s8, s9, s10;
      uint8x8_t res1, res2, res3;
#endif

      do {
        __builtin_prefetch(v_src_ptr + 0 * im_stride);
        __builtin_prefetch(v_src_ptr + 1 * im_stride);
        __builtin_prefetch(v_src_ptr + 2 * im_stride);
        __builtin_prefetch(v_src_ptr + 3 * im_stride);
        __builtin_prefetch(v_src_ptr + 4 * im_stride);
        __builtin_prefetch(v_src_ptr + 5 * im_stride);
        __builtin_prefetch(v_src_ptr + 6 * im_stride);
        __builtin_prefetch(v_src_ptr + 7 * im_stride);

        v_s = v_src_ptr;
        load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
        v_s += (7 * im_stride);

        d_u8 = dst_u8_ptr;
        height = h;

        do {
#if defined(__aarch64__)
          load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
          v_s += (im_stride << 2);

          __builtin_prefetch(d_u8 + 4 * dst_stride);
          __builtin_prefetch(d_u8 + 5 * dst_stride);
          __builtin_prefetch(d_u8 + 6 * dst_stride);
          __builtin_prefetch(d_u8 + 7 * dst_stride);

          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);
          res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);
          res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);
          res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);

          if (h != 2) {
            vst1_u8(d_u8, res0);
            d_u8 += dst_stride;
            vst1_u8(d_u8, res1);
            d_u8 += dst_stride;
            vst1_u8(d_u8, res2);
            d_u8 += dst_stride;
            vst1_u8(d_u8, res3);
            d_u8 += dst_stride;
          } else {
            vst1_u8(d_u8, res0);
            d_u8 += dst_stride;
            vst1_u8(d_u8, res1);
            d_u8 += dst_stride;
          }
          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
#else
          s7 = vld1q_s16(v_s);
          v_s += im_stride;

          __builtin_prefetch(d_u8 + 0 * dst_stride);

          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);

          vst1_u8(d_u8, res0);
          d_u8 += dst_stride;

          s0 = s1;
          s1 = s2;
          s2 = s3;
          s3 = s4;
          s4 = s5;
          s5 = s6;
          s6 = s7;
          height -= 1;
#endif
        } while (height > 0);
        v_src_ptr += 8;
        dst_u8_ptr += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}

void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  const InterpFilterParams *filter_params_y,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;
  (void)conv_params;

  const uint8_t *src1;
  uint8_t *dst1;
  int y;

  if (!(w & 0x0F)) {
    for (y = 0; y < h; ++y) {
      src1 = src;
      dst1 = dst;
      for (int x = 0; x < (w >> 4); ++x) {
        vst1q_u8(dst1, vld1q_u8(src1));
        src1 += 16;
        dst1 += 16;
      }
      src += src_stride;
      dst += dst_stride;
    }
  } else if (!(w & 0x07)) {
    for (y = 0; y < h; ++y) {
      vst1_u8(dst, vld1_u8(src));
      src += src_stride;
      dst += dst_stride;
    }
  } else if (!(w & 0x03)) {
    for (y = 0; y < h; ++y) {
      vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0);
      src += src_stride;
      dst += dst_stride;
    }
  } else if (!(w & 0x01)) {
    for (y = 0; y < h; ++y) {
      vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0);
      src += src_stride;
      dst += dst_stride;
    }
  }
}