/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_ports/mem.h"

// Note:
// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
// 2. Refactoring the shared code in the kernel loops into inline functions
// made the decoder significantly slower when built with gcc, so those parts
// are left unrefactored for now.
// 3. For the horizontal convolution there is an alternative optimization that
// convolves a single row per loop iteration. For each row, 8 sample banks of
// 4 or 8 samples each are read from memory: src, (src + 1), (src + 2),
// (src + 3), (src + 4), (src + 5), (src + 6), (src + 7), or prepared with
// vector extract instructions. This is much faster in the speed unit test,
// but slowed the whole decoder down by 5%.
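
// A minimal sketch of the row-based alternative described in note 3, kept
// only for illustration and never compiled: the sample banks src + 0 through
// src + 7 are formed with vext_u8 from two adjacent loads and fed to
// convolve8_8(). The helper name convolve8_row_sketch is hypothetical.
#if 0
static INLINE uint8x8_t convolve8_row_sketch(const uint8_t *src,
                                             const int16x8_t filters,
                                             const int16x8_t filter3,
                                             const int16x8_t filter4) {
  const uint8x8_t lo = vld1_u8(src);      // src[0..7]
  const uint8x8_t hi = vld1_u8(src + 8);  // src[8..15]
  // Bank k holds src[k..k+7]; bank 0 is the plain load, banks 1-7 are
  // produced with vector extracts instead of reloading from memory.
  const int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(lo));
  const int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(lo, hi, 1)));
  const int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(lo, hi, 2)));
  const int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(lo, hi, 3)));
  const int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(lo, hi, 4)));
  const int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(lo, hi, 5)));
  const int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(lo, hi, 6)));
  const int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(lo, hi, 7)));
  // Eight output pixels for this row; convolve8_8() rounds and saturates.
  return convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                     filter4);
}
#endif
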
static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
                                const uint8x8_t s0, const uint8x8_t s1,
                                const uint8x8_t s2, const uint8x8_t s3,
                                const uint8x8_t s4, const uint8x8_t s5,
                                const uint8x8_t s6, const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int x0_q4,
                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

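  // Two paths below: when h == 4, blocks of 4 rows are transposed so that
  // convolve8_4() filters all 4 rows in parallel across the vector lanes;
  // otherwise blocks of 8 rows are transposed and filtered with
  // convolve8_8(). Both helpers live in vpx_dsp/arm/vpx_convolve8_neon.h;
  // convolve8_4() returns raw filter sums that the caller narrows with
  // vqrshrun_n_s16(sum, 7), i.e. saturate((sum + 64) >> 7), while
  // convolve8_8() does that narrowing itself.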
  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
        d1, d2, d3;
    int16x8_t tt0, tt1, tt2, tt3;

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    s0 = vget_low_s16(tt0);
    s1 = vget_low_s16(tt1);
    s2 = vget_low_s16(tt2);
    s3 = vget_low_s16(tt3);
    s4 = vget_high_s16(tt0);
    s5 = vget_high_s16(tt1);
    s6 = vget_high_s16(tt2);
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s7 = vget_low_s16(tt0);
      s8 = vget_low_s16(tt1);
      s9 = vget_low_s16(tt2);
      s10 = vget_low_s16(tt3);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      transpose_u8_4x4(&d01, &d23);

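      // transpose_u8_4x4() leaves rows 0 and 2 in the two halves of d01 and
      // rows 1 and 3 in the halves of d23, hence the interleaved lane
      // indices in the stores below.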
      vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
                    vreinterpret_u32_u8(d01), 0);
      vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
                    vreinterpret_u32_u8(d23), 0);
      vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
                    vreinterpret_u32_u8(d01), 1);
      vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
                    vreinterpret_u32_u8(d23), 1);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

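    // w == 4: 8 rows per iteration. After the transposes, each s-vector
    // holds one source column across all 8 rows, so a single convolve8_8()
    // call produces one 8-row output column; the results are transposed
    // back to rows before storing.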
    if (w == 4) {
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        transpose_u8_8x4(&t0, &t1, &t2, &t3);
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1);
        dst += dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8_t *d;
      int16x8_t s11, s12, s13, s14;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

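        // Slide an 8-column window across the row block: s0-s6 carry the 7
        // rightmost columns over from the previous step, so each iteration
        // loads and transposes only 8 new columns (s7-s14).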
        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                           filter4);
          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                           filter4);
          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                           filter4);
          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                           filter4);
          t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
                           filter4);
          t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
                           filter4);
          t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
                           filter4);
          t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
                           filter3, filter4);

          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h) {
  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
        d1, d2, d3;
    int16x8_t tt0, tt1, tt2, tt3;
    uint32x4_t d0123 = vdupq_n_u32(0);

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    s0 = vget_low_s16(tt0);
    s1 = vget_low_s16(tt1);
    s2 = vget_low_s16(tt2);
    s3 = vget_low_s16(tt3);
    s4 = vget_high_s16(tt0);
    s5 = vget_high_s16(tt1);
    s6 = vget_high_s16(tt2);
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s7 = vget_low_s16(tt0);
      s8 = vget_low_s16(tt1);
      s9 = vget_low_s16(tt2);
      s10 = vget_low_s16(tt3);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      transpose_u8_4x4(&d01, &d23);

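      // Average with dst. vcombine_u8(d01, d23) holds rows 0, 2, 1, 3 in its
      // four 32-bit lanes, so the dst rows are gathered in the same order
      // (lanes 0, 2, 1, 3) before the rounding average vrhaddq_u8(a, b) =
      // (a + b + 1) >> 1.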
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
      d0123 = vreinterpretq_u32_u8(
          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));

      vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
      vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
      vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    if (w == 4) {
      uint32x4_t d0415 = vdupq_n_u32(0);
      uint32x4_t d2637 = vdupq_n_u32(0);
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        transpose_u8_8x4(&t0, &t1, &t2, &t3);

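        // After transpose_u8_8x4(), t0-t3 hold row pairs (0,4), (1,5), (2,6)
        // and (3,7), so d0415 gathers dst rows 0, 4, 1, 5 and d2637 rows
        // 2, 6, 3, 7 to line up with vcombine_u8(t0, t1) and
        // vcombine_u8(t2, t3) for the rounding average.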
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3);
        d0415 = vreinterpretq_u32_u8(
            vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1)));
        d2637 = vreinterpretq_u32_u8(
            vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3)));

        vst1q_lane_u32((uint32_t *)dst, d0415, 0);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 2);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 0);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 2);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 1);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 3);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 1);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 3);
        dst += dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8_t *d;
      int16x8_t s11, s12, s13, s14;
      uint8x16_t d01, d23, d45, d67;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                           filter4);
          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                           filter4);
          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                           filter4);
          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                           filter4);
          t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
                           filter4);
          t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
                           filter4);
          t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
                           filter4);
          t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
                           filter3, filter4);

          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

          d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
                            vld1_u8(d + 1 * dst_stride));
          d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
                            vld1_u8(d + 3 * dst_stride));
          d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride),
                            vld1_u8(d + 5 * dst_stride));
          d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride),
                            vld1_u8(d + 7 * dst_stride));
          d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1));
          d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3));
          d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
          d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));

          store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
                       vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
                       vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

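  // The vertical filter walks down columns, so no transposes are needed:
  // each s-vector below is a whole row. Every loop iteration keeps 7 of the
  // 11 rows in flight and loads only 4 new ones to emit 4 output rows.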
  if (w == 4) {
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
      dst += dst_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
    } while (h > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0, t1, t2, t3;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        vst1_u8(d, t0);
        d += dst_stride;
        vst1_u8(d, t1);
        d += dst_stride;
        vst1_u8(d, t2);
        d += dst_stride;
        vst1_u8(d, t3);
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}

void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

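  // Same structure as vpx_convolve8_vert_neon() above, with the filtered
  // pixels averaged against the existing dst contents (vrhaddq_u8) before
  // the stores.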
  if (w == 4) {
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
    uint32x4_t d0123 = vdupq_n_u32(0);

    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);

      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
      d0123 = vreinterpretq_u32_u8(
          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));

      vst1q_lane_u32((uint32_t *)dst, d0123, 0);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 1);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 2);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 3);
      dst += dst_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
    } while (h > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0, t1, t2, t3;
    uint8x16_t d01, d23, dd01, dd23;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        d01 = vcombine_u8(t0, t1);
        d23 = vcombine_u8(t2, t3);
        dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
                           vld1_u8(d + 1 * dst_stride));
        dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
                           vld1_u8(d + 3 * dst_stride));
        dd01 = vrhaddq_u8(dd01, d01);
        dd23 = vrhaddq_u8(dd23, d23);

        vst1_u8(d, vget_low_u8(dd01));
        d += dst_stride;
        vst1_u8(d, vget_high_u8(dd01));
        d += dst_stride;
        vst1_u8(d, vget_low_u8(dd23));
        d += dst_stride;
        vst1_u8(d, vget_high_u8(dd23));
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}