/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_ports/mem.h"

// Note:
// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
// 2. After refactoring the shared code in the kernel loops into inline
// functions, the decoder speed dropped a lot when using the gcc compiler.
// Therefore those parts are not refactored for now.
// 3. For the horizontal convolve, there is an alternative optimization that
// convolves a single row in each loop. For each row, 8 sample banks with 4 or
// 8 samples each are read from memory: src, (src+1), (src+2), (src+3),
// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract
// instructions. This optimization is much faster in the speed unit test, but
// slows the whole decoder down by 5%.

static INLINE void load_8x4(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0,
                            uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static INLINE void load_8x8(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0,
                            uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3,
                            uint8x8_t *s4, uint8x8_t *s5, uint8x8_t *s6,
                            uint8x8_t *s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static INLINE void store_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                             const uint8x8_t s1, const uint8x8_t s2,
                             const uint8x8_t s3, const uint8x8_t s4,
                             const uint8x8_t s5, const uint8x8_t s6,
                             const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static INLINE int16x4_t convolve8_4(int16x4_t s0, int16x4_t s1, int16x4_t s2,
                                    int16x4_t s3, int16x4_t s4, int16x4_t s5,
                                    int16x4_t s6, int16x4_t s7,
                                    int16x8_t filters, int16x4_t filter3,
                                    int16x4_t filter4) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int16x4_t sum = vdup_n_s16(0);

  sum = vmla_lane_s16(sum, s0, filters_lo, 0);
  sum = vmla_lane_s16(sum, s1, filters_lo, 1);
  sum = vmla_lane_s16(sum, s2, filters_lo, 2);
  sum = vmla_lane_s16(sum, s5, filters_hi, 1);
  sum = vmla_lane_s16(sum, s6, filters_hi, 2);
  sum = vmla_lane_s16(sum, s7, filters_hi, 3);
  sum = vqadd_s16(sum, vmul_s16(s3, filter3));
  sum = vqadd_s16(sum, vmul_s16(s4, filter4));
  return sum;
}
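// convolve8_4 above and convolve8_8 below each evaluate one step of the 8-tap
// sub-pixel filter for 4 or 8 output pixels. A rough scalar sketch of the
// value every lane ends up holding (illustration only, not part of the build;
// clip_to_uint8() stands in for the final saturation to [0, 255]):
//
//   int sum = 0, k;
//   for (k = 0; k < 8; ++k) sum += src[k] * filter[k];
//   out = clip_to_uint8((sum + 64) >> 7);
//
// The two middle taps (filter[3] and filter[4]) are the largest, so their
// products are folded in with saturating adds (vqadd/vqaddq) to keep the
// 16-bit accumulator from wrapping; the rounding shift by 7 (FILTER_BITS in
// the C reference code) and the narrowing to uint8 happen later in the
// callers via vqrshrun_n_s16(sum, 7).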
static INLINE int16x8_t convolve8_8(int16x8_t s0, int16x8_t s1, int16x8_t s2,
                                    int16x8_t s3, int16x8_t s4, int16x8_t s5,
                                    int16x8_t s6, int16x8_t s7,
                                    int16x8_t filters, int16x8_t filter3,
                                    int16x8_t filter4) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int16x8_t sum = vdupq_n_s16(0);

  sum = vmlaq_lane_s16(sum, s0, filters_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
  sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
  sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
  sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
  return sum;
}

void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y,  // unused
                              int y_step_q4,            // unused
                              int w, int h) {
  const int16x8_t filters = vld1q_s16(filter_x);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  src -= 3;

  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
        d1, d2, d3;
    int16x8_t tt0, tt1, tt2, tt3;

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    s0 = vget_low_s16(tt0);
    s1 = vget_low_s16(tt1);
    s2 = vget_low_s16(tt2);
    s3 = vget_low_s16(tt3);
    s4 = vget_high_s16(tt0);
    s5 = vget_high_s16(tt1);
    s6 = vget_high_s16(tt2);
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s7 = vget_low_s16(tt0);
      s8 = vget_low_s16(tt1);
      s9 = vget_low_s16(tt2);
      s10 = vget_low_s16(tt3);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      transpose_u8_4x4(&d01, &d23);

      vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
                    vreinterpret_u32_u8(d01), 0);
      vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
                    vreinterpret_u32_u8(d23), 0);
      vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
                    vreinterpret_u32_u8(d01), 1);
      vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
                    vreinterpret_u32_u8(d23), 1);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    if (w == 4) {
      do {
        load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        t0 = vqrshrun_n_s16(d0, 7);
        t1 = vqrshrun_n_s16(d1, 7);
        t2 = vqrshrun_n_s16(d2, 7);
        t3 = vqrshrun_n_s16(d3, 7);
        transpose_u8_8x4(&t0, &t1, &t2, &t3);
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1);
        dst += dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8_t *d;
      int16x8_t s11, s12, s13, s14, d4, d5, d6, d7;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                           filter4);
          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                           filter4);
          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                           filter4);
          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                           filter4);
          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
                           filter4);
          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
                           filter4);
          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
                           filter4);
          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
                           filter3, filter4);

          t0 = vqrshrun_n_s16(d0, 7);
          t1 = vqrshrun_n_s16(d1, 7);
          t2 = vqrshrun_n_s16(d2, 7);
          t3 = vqrshrun_n_s16(d3, 7);
          t4 = vqrshrun_n_s16(d4, 7);
          t5 = vqrshrun_n_s16(d5, 7);
          t6 = vqrshrun_n_s16(d6, 7);
          t7 = vqrshrun_n_s16(d7, 7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          store_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}
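// vpx_convolve8_avg_horiz_neon below follows the same structure as
// vpx_convolve8_horiz_neon above; the difference is that the filtered result
// is rounded into the pixels already present in dst with vrhaddq_u8
// (rounding halving add) before being stored back.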
void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y,  // unused
                                  int y_step_q4,            // unused
                                  int w, int h) {
  const int16x8_t filters = vld1q_s16(filter_x);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  src -= 3;

  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
        d1, d2, d3;
    int16x8_t tt0, tt1, tt2, tt3;
    uint32x4_t d0123 = vdupq_n_u32(0);

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    s0 = vget_low_s16(tt0);
    s1 = vget_low_s16(tt1);
    s2 = vget_low_s16(tt2);
    s3 = vget_low_s16(tt3);
    s4 = vget_high_s16(tt0);
    s5 = vget_high_s16(tt1);
    s6 = vget_high_s16(tt2);
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s7 = vget_low_s16(tt0);
      s8 = vget_low_s16(tt1);
      s9 = vget_low_s16(tt2);
      s10 = vget_low_s16(tt3);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      transpose_u8_4x4(&d01, &d23);

      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
      d0123 = vreinterpretq_u32_u8(
          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));

      vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
      vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
      vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    if (w == 4) {
      uint32x4_t d0415 = vdupq_n_u32(0);
      uint32x4_t d2637 = vdupq_n_u32(0);
      do {
        load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        t0 = vqrshrun_n_s16(d0, 7);
        t1 = vqrshrun_n_s16(d1, 7);
        t2 = vqrshrun_n_s16(d2, 7);
        t3 = vqrshrun_n_s16(d3, 7);
        transpose_u8_8x4(&t0, &t1, &t2, &t3);

        d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3);
        d0415 = vreinterpretq_u32_u8(
            vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1)));
        d2637 = vreinterpretq_u32_u8(
            vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3)));

        vst1q_lane_u32((uint32_t *)dst, d0415, 0);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 2);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 0);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 2);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 1);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 3);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 1);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 3);
        dst += dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8_t *d;
      int16x8_t s11, s12, s13, s14, d4, d5, d6, d7;
      uint8x16_t d01, d23, d45, d67;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                           filter4);
          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                           filter4);
          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                           filter4);
          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                           filter4);
          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
                           filter4);
          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
                           filter4);
          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
                           filter4);
          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
                           filter3, filter4);

          t0 = vqrshrun_n_s16(d0, 7);
          t1 = vqrshrun_n_s16(d1, 7);
          t2 = vqrshrun_n_s16(d2, 7);
          t3 = vqrshrun_n_s16(d3, 7);
          t4 = vqrshrun_n_s16(d4, 7);
          t5 = vqrshrun_n_s16(d5, 7);
          t6 = vqrshrun_n_s16(d6, 7);
          t7 = vqrshrun_n_s16(d7, 7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

          d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
                            vld1_u8(d + 1 * dst_stride));
          d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
                            vld1_u8(d + 3 * dst_stride));
          d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride),
                            vld1_u8(d + 5 * dst_stride));
          d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride),
                            vld1_u8(d + 7 * dst_stride));
          d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1));
          d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3));
          d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
          d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));

          store_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
                    vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
                    vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}
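// The vertical kernels need no transposes: whole rows are loaded directly, a
// sliding window of seven rows (s0-s6) is carried across iterations, four new
// rows are read per iteration, and the window is rotated before the next pass.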
void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x,  // unused
                             int x_step_q4,            // unused
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  const int16x8_t filters = vld1q_s16(filter_y);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_x;

  src -= 3 * src_stride;

  if (w == 4) {
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
      dst += dst_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
    } while (h > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int height;
    const uint8_t *s;
    uint8_t *d;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        vst1_u8(d, vqrshrun_n_s16(d0, 7));
        d += dst_stride;
        vst1_u8(d, vqrshrun_n_s16(d1, 7));
        d += dst_stride;
        vst1_u8(d, vqrshrun_n_s16(d2, 7));
        d += dst_stride;
        vst1_u8(d, vqrshrun_n_s16(d3, 7));
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}
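// As with the horizontal pair, vpx_convolve8_avg_vert_neon mirrors
// vpx_convolve8_vert_neon and additionally averages the filtered rows into
// the existing dst pixels with vrhaddq_u8.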
void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x,  // unused
                                 int x_step_q4,            // unused
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  const int16x8_t filters = vld1q_s16(filter_y);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_x;

  src -= 3 * src_stride;

  if (w == 4) {
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
    uint32x4_t d0123 = vdupq_n_u32(0);

    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);

      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
      d0123 = vreinterpretq_u32_u8(
          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));

      vst1q_lane_u32((uint32_t *)dst, d0123, 0);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 1);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 2);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 3);
      dst += dst_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
    } while (h > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x16_t d01, d23, dd01, dd23;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        d01 = vcombine_u8(vqrshrun_n_s16(d0, 7), vqrshrun_n_s16(d1, 7));
        d23 = vcombine_u8(vqrshrun_n_s16(d2, 7), vqrshrun_n_s16(d3, 7));
        dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
                           vld1_u8(d + 1 * dst_stride));
        dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
                           vld1_u8(d + 3 * dst_stride));
        dd01 = vrhaddq_u8(dd01, d01);
        dd23 = vrhaddq_u8(dd23, d23);

        vst1_u8(d, vget_low_u8(dd01));
        d += dst_stride;
        vst1_u8(d, vget_high_u8(dd01));
        d += dst_stride;
        vst1_u8(d, vget_low_u8(dd23));
        d += dst_stride;
        vst1_u8(d, vget_high_u8(dd23));
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}