/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"
#include "vpx_ports/arm.h"

static INLINE void vp8_loop_filter_neon(uint8x16_t qblimit,  // flimit
                                        uint8x16_t qlimit,   // limit
                                        uint8x16_t qthresh,  // thresh
                                        uint8x16_t q3,       // p3
                                        uint8x16_t q4,       // p2
                                        uint8x16_t q5,       // p1
                                        uint8x16_t q6,       // p0
                                        uint8x16_t q7,       // q0
                                        uint8x16_t q8,       // q1
                                        uint8x16_t q9,       // q2
                                        uint8x16_t q10,      // q3
                                        uint8x16_t *q5r,     // p1
                                        uint8x16_t *q6r,     // p0
                                        uint8x16_t *q7r,     // q0
                                        uint8x16_t *q8r) {   // q1
  uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
  int16x8_t q2s16, q11s16;
  uint16x8_t q4u16;
  int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
  int8x8_t d2s8, d3s8;

  q11u8 = vabdq_u8(q3, q4);
  q12u8 = vabdq_u8(q4, q5);
  q13u8 = vabdq_u8(q5, q6);
  q14u8 = vabdq_u8(q8, q7);
  q3 = vabdq_u8(q9, q8);
  q4 = vabdq_u8(q10, q9);

  q11u8 = vmaxq_u8(q11u8, q12u8);
  q12u8 = vmaxq_u8(q13u8, q14u8);
  q3 = vmaxq_u8(q3, q4);
  q15u8 = vmaxq_u8(q11u8, q12u8);

  q9 = vabdq_u8(q6, q7);

  // vp8_hevmask
  q13u8 = vcgtq_u8(q13u8, qthresh);
  q14u8 = vcgtq_u8(q14u8, qthresh);
  q15u8 = vmaxq_u8(q15u8, q3);

  q2u8 = vabdq_u8(q5, q8);
  q9 = vqaddq_u8(q9, q9);

  q15u8 = vcgeq_u8(qlimit, q15u8);

  // vp8_filter() function
  // convert to signed
  q10 = vdupq_n_u8(0x80);
  q8 = veorq_u8(q8, q10);
  q7 = veorq_u8(q7, q10);
  q6 = veorq_u8(q6, q10);
  q5 = veorq_u8(q5, q10);

  q2u8 = vshrq_n_u8(q2u8, 1);
  q9 = vqaddq_u8(q9, q2u8);

  q10 = vdupq_n_u8(3);

  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                   vget_low_s8(vreinterpretq_s8_u8(q6)));
  q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                    vget_high_s8(vreinterpretq_s8_u8(q6)));

  q9 = vcgeq_u8(qblimit, q9);

  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));

  q14u8 = vorrq_u8(q13u8, q14u8);

  q4u16 = vmovl_u8(vget_low_u8(q10));
  q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
  q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));

  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
  q15u8 = vandq_u8(q15u8, q9);

  q1s8 = vreinterpretq_s8_u8(q1u8);
  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
  q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));

  q9 = vdupq_n_u8(4);
  // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
  d2s8 = vqmovn_s16(q2s16);
  d3s8 = vqmovn_s16(q11s16);
  q1s8 = vcombine_s8(d2s8, d3s8);
  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
  q1s8 = vreinterpretq_s8_u8(q1u8);

  q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
  q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
  q2s8 = vshrq_n_s8(q2s8, 3);
  q1s8 = vshrq_n_s8(q1s8, 3);

  q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
  q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);

  q1s8 = vrshrq_n_s8(q1s8, 1);
  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

  q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
  q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);

  q0u8 = vdupq_n_u8(0x80);
  *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
  *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
  *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
  *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
  return;
}
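
/* For reference, a scalar sketch of the per-pixel math vectorized above
 * (illustration only, not compiled).  clamp() denotes signed 8-bit
 * saturation, and ps1, ps0, qs0, qs1 are the pixel values after the
 * 0x80 sign-bias flip:
 *
 *   mask = max(|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) <= limit
 *          && 2 * |p0-q0| + |p1-q1| / 2 <= blimit;
 *   hev  = |p1-p0| > thresh || |q1-q0| > thresh;
 *
 *   f   = hev ? clamp(ps1 - qs1) : 0;
 *   f   = mask ? clamp(f + 3 * (qs0 - ps0)) : 0;
 *   F1  = clamp(f + 4) >> 3;
 *   F2  = clamp(f + 3) >> 3;
 *   qs0 = clamp(qs0 - F1);
 *   ps0 = clamp(ps0 + F2);
 *   f   = hev ? 0 : (F1 + 1) >> 1;
 *   qs1 = clamp(qs1 - f);
 *   ps1 = clamp(ps1 + f);
 */
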
void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
                                            unsigned char blimit,
                                            unsigned char limit,
                                            unsigned char thresh) {
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);
  src -= (pitch << 2);

  q3 = vld1q_u8(src);
  src += pitch;
  q4 = vld1q_u8(src);
  src += pitch;
  q5 = vld1q_u8(src);
  src += pitch;
  q6 = vld1q_u8(src);
  src += pitch;
  q7 = vld1q_u8(src);
  src += pitch;
  q8 = vld1q_u8(src);
  src += pitch;
  q9 = vld1q_u8(src);
  src += pitch;
  q10 = vld1q_u8(src);

  vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                       q10, &q5, &q6, &q7, &q8);

  src -= (pitch * 5);
  vst1q_u8(src, q5);
  src += pitch;
  vst1q_u8(src, q6);
  src += pitch;
  vst1q_u8(src, q7);
  src += pitch;
  vst1q_u8(src, q8);
  return;
}

void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
                                             unsigned char blimit,
                                             unsigned char limit,
                                             unsigned char thresh,
                                             unsigned char *v) {
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

  u -= (pitch << 2);
  v -= (pitch << 2);

  d6 = vld1_u8(u);
  u += pitch;
  d7 = vld1_u8(v);
  v += pitch;
  d8 = vld1_u8(u);
  u += pitch;
  d9 = vld1_u8(v);
  v += pitch;
  d10 = vld1_u8(u);
  u += pitch;
  d11 = vld1_u8(v);
  v += pitch;
  d12 = vld1_u8(u);
  u += pitch;
  d13 = vld1_u8(v);
  v += pitch;
  d14 = vld1_u8(u);
  u += pitch;
  d15 = vld1_u8(v);
  v += pitch;
  d16 = vld1_u8(u);
  u += pitch;
  d17 = vld1_u8(v);
  v += pitch;
  d18 = vld1_u8(u);
  u += pitch;
  d19 = vld1_u8(v);
  v += pitch;
  d20 = vld1_u8(u);
  d21 = vld1_u8(v);

  // Pack u rows into the low halves and v rows into the high halves so both
  // chroma planes are filtered in a single 128-bit pass.
  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

  vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                       q10, &q5, &q6, &q7, &q8);

  u -= (pitch * 5);
  vst1_u8(u, vget_low_u8(q5));
  u += pitch;
  vst1_u8(u, vget_low_u8(q6));
  u += pitch;
  vst1_u8(u, vget_low_u8(q7));
  u += pitch;
  vst1_u8(u, vget_low_u8(q8));

  v -= (pitch * 5);
  vst1_u8(v, vget_high_u8(q5));
  v += pitch;
  vst1_u8(v, vget_high_u8(q6));
  v += pitch;
  vst1_u8(v, vget_high_u8(q7));
  v += pitch;
  vst1_u8(v, vget_high_u8(q8));
  return;
}
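
/* Writes a transposed 4x8 block: result holds four 8-pixel columns and
 * each lane store below emits one 4-byte row.  The VPX_INCOMPATIBLE_GCC
 * path emulates vst4_lane_u8 with vtrn-based transposes (diagrammed
 * inside), presumably as a workaround for toolchains that miscompile
 * the lane store. */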
static INLINE void write_4x8(unsigned char *dst, int pitch,
                             const uint8x8x4_t result) {
#ifdef VPX_INCOMPATIBLE_GCC
  /*
   * uint8x8x4_t result
  00 01 02 03 | 04 05 06 07
  10 11 12 13 | 14 15 16 17
  20 21 22 23 | 24 25 26 27
  30 31 32 33 | 34 35 36 37
  ---
  * after vtrn_u16
  00 01 20 21 | 04 05 24 25
  02 03 22 23 | 06 07 26 27
  10 11 30 31 | 14 15 34 35
  12 13 32 33 | 16 17 36 37
  ---
  * after vtrn_u8
  00 10 20 30 | 04 14 24 34
  01 11 21 31 | 05 15 25 35
  02 12 22 32 | 06 16 26 36
  03 13 23 33 | 07 17 27 37
  */
  const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
                                        vreinterpret_u16_u8(result.val[2]));
  const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
                                        vreinterpret_u16_u8(result.val[3]));
  const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
                                     vreinterpret_u8_u16(r13_u16.val[0]));
  const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
                                     vreinterpret_u8_u16(r13_u16.val[1]));
  const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
  const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
  const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
  const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
  vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
#else
  vst4_lane_u8(dst, result, 0);
  dst += pitch;
  vst4_lane_u8(dst, result, 1);
  dst += pitch;
  vst4_lane_u8(dst, result, 2);
  dst += pitch;
  vst4_lane_u8(dst, result, 3);
  dst += pitch;
  vst4_lane_u8(dst, result, 4);
  dst += pitch;
  vst4_lane_u8(dst, result, 5);
  dst += pitch;
  vst4_lane_u8(dst, result, 6);
  dst += pitch;
  vst4_lane_u8(dst, result, 7);
#endif  // VPX_INCOMPATIBLE_GCC
}
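
/* Vertical (left) macroblock edge in the Y plane: sixteen 8-byte rows
 * are loaded starting 4 pixels left of the edge, the vtrnq_u32/u16/u8
 * cascade below transposes them so q3..q10 each hold one pixel column
 * (p3..q3), the columns are filtered, and the four modified center
 * columns are transposed back to memory via write_4x8(). */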
void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
                                          unsigned char blimit,
                                          unsigned char limit,
                                          unsigned char thresh) {
  unsigned char *s, *d;
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;
  uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
  uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
  uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
  uint8x8x4_t q4ResultH, q4ResultL;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

  s = src - 4;
  d6 = vld1_u8(s);
  s += pitch;
  d8 = vld1_u8(s);
  s += pitch;
  d10 = vld1_u8(s);
  s += pitch;
  d12 = vld1_u8(s);
  s += pitch;
  d14 = vld1_u8(s);
  s += pitch;
  d16 = vld1_u8(s);
  s += pitch;
  d18 = vld1_u8(s);
  s += pitch;
  d20 = vld1_u8(s);
  s += pitch;
  d7 = vld1_u8(s);
  s += pitch;
  d9 = vld1_u8(s);
  s += pitch;
  d11 = vld1_u8(s);
  s += pitch;
  d13 = vld1_u8(s);
  s += pitch;
  d15 = vld1_u8(s);
  s += pitch;
  d17 = vld1_u8(s);
  s += pitch;
  d19 = vld1_u8(s);
  s += pitch;
  d21 = vld1_u8(s);

  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

  q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
  q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
  q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
  q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

  q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                     vreinterpretq_u16_u32(q2tmp2.val[0]));
  q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                     vreinterpretq_u16_u32(q2tmp3.val[0]));
  q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                     vreinterpretq_u16_u32(q2tmp2.val[1]));
  q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                     vreinterpretq_u16_u32(q2tmp3.val[1]));

  q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                    vreinterpretq_u8_u16(q2tmp5.val[0]));
  q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                    vreinterpretq_u8_u16(q2tmp5.val[1]));
  q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                     vreinterpretq_u8_u16(q2tmp7.val[0]));
  q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                     vreinterpretq_u8_u16(q2tmp7.val[1]));

  q3 = q2tmp8.val[0];
  q4 = q2tmp8.val[1];
  q5 = q2tmp9.val[0];
  q6 = q2tmp9.val[1];
  q7 = q2tmp10.val[0];
  q8 = q2tmp10.val[1];
  q9 = q2tmp11.val[0];
  q10 = q2tmp11.val[1];

  vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                       q10, &q5, &q6, &q7, &q8);

  q4ResultL.val[0] = vget_low_u8(q5);   // d10
  q4ResultL.val[1] = vget_low_u8(q6);   // d12
  q4ResultL.val[2] = vget_low_u8(q7);   // d14
  q4ResultL.val[3] = vget_low_u8(q8);   // d16
  q4ResultH.val[0] = vget_high_u8(q5);  // d11
  q4ResultH.val[1] = vget_high_u8(q6);  // d13
  q4ResultH.val[2] = vget_high_u8(q7);  // d15
  q4ResultH.val[3] = vget_high_u8(q8);  // d17

  d = src - 2;
  write_4x8(d, pitch, q4ResultL);
  d += pitch * 8;
  write_4x8(d, pitch, q4ResultH);
}
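
/* Chroma variant of the vertical edge filter: eight rows of u fill the
 * low halves and eight rows of v the high halves of q3..q10, so a
 * single transpose + filter pass covers both planes before the halves
 * are written back separately through write_4x8(). */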
void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
                                           unsigned char blimit,
                                           unsigned char limit,
                                           unsigned char thresh,
                                           unsigned char *v) {
  unsigned char *us, *ud;
  unsigned char *vs, *vd;
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;
  uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
  uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
  uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
  uint8x8x4_t q4ResultH, q4ResultL;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

  us = u - 4;
  d6 = vld1_u8(us);
  us += pitch;
  d8 = vld1_u8(us);
  us += pitch;
  d10 = vld1_u8(us);
  us += pitch;
  d12 = vld1_u8(us);
  us += pitch;
  d14 = vld1_u8(us);
  us += pitch;
  d16 = vld1_u8(us);
  us += pitch;
  d18 = vld1_u8(us);
  us += pitch;
  d20 = vld1_u8(us);

  vs = v - 4;
  d7 = vld1_u8(vs);
  vs += pitch;
  d9 = vld1_u8(vs);
  vs += pitch;
  d11 = vld1_u8(vs);
  vs += pitch;
  d13 = vld1_u8(vs);
  vs += pitch;
  d15 = vld1_u8(vs);
  vs += pitch;
  d17 = vld1_u8(vs);
  vs += pitch;
  d19 = vld1_u8(vs);
  vs += pitch;
  d21 = vld1_u8(vs);

  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

  q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
  q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
  q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
  q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

  q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                     vreinterpretq_u16_u32(q2tmp2.val[0]));
  q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                     vreinterpretq_u16_u32(q2tmp3.val[0]));
  q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                     vreinterpretq_u16_u32(q2tmp2.val[1]));
  q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                     vreinterpretq_u16_u32(q2tmp3.val[1]));

  q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                    vreinterpretq_u8_u16(q2tmp5.val[0]));
  q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                    vreinterpretq_u8_u16(q2tmp5.val[1]));
  q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                     vreinterpretq_u8_u16(q2tmp7.val[0]));
  q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                     vreinterpretq_u8_u16(q2tmp7.val[1]));

  q3 = q2tmp8.val[0];
  q4 = q2tmp8.val[1];
  q5 = q2tmp9.val[0];
  q6 = q2tmp9.val[1];
  q7 = q2tmp10.val[0];
  q8 = q2tmp10.val[1];
  q9 = q2tmp11.val[0];
  q10 = q2tmp11.val[1];

  vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                       q10, &q5, &q6, &q7, &q8);

  q4ResultL.val[0] = vget_low_u8(q5);  // d10
  q4ResultL.val[1] = vget_low_u8(q6);  // d12
  q4ResultL.val[2] = vget_low_u8(q7);  // d14
  q4ResultL.val[3] = vget_low_u8(q8);  // d16
  ud = u - 2;
  write_4x8(ud, pitch, q4ResultL);

  q4ResultH.val[0] = vget_high_u8(q5);  // d11
  q4ResultH.val[1] = vget_high_u8(q6);  // d13
  q4ResultH.val[2] = vget_high_u8(q7);  // d15
  q4ResultH.val[3] = vget_high_u8(q8);  // d17
  vd = v - 2;
  write_4x8(vd, pitch, q4ResultH);
}