// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee (somnath (at) google.com)
//          Johann Koenig (johannkoenig (at) google.com)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include "src/dsp/neon.h"
#include "src/dec/vp8i_dec.h"

//------------------------------------------------------------------------------
// NxM Loading functions

#if !defined(WORK_AROUND_GCC)

// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
// (register alloc, probably). The variants somewhat mitigate the problem, but
// not quite. HFilter16i() remains problematic.
// Loads a block of 4 columns x 8 rows. Row r (r = 0..7) is read from
// 'src + r * stride' and de-interleaved into lane r of out.val[0..3], so that
// out.val[c] ends up holding column c of the block.
static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
                                            int stride) {
  const uint8x8_t zero = vdup_n_u8(0);
  uint8x8x4_t out;
  INIT_VECTOR4(out, zero, zero, zero, zero);
  out = vld4_lane_u8(src + 0 * stride, out, 0);
  out = vld4_lane_u8(src + 1 * stride, out, 1);
  out = vld4_lane_u8(src + 2 * stride, out, 2);
  out = vld4_lane_u8(src + 3 * stride, out, 3);
  out = vld4_lane_u8(src + 4 * stride, out, 4);
  out = vld4_lane_u8(src + 5 * stride, out, 5);
  out = vld4_lane_u8(src + 6 * stride, out, 6);
  out = vld4_lane_u8(src + 7 * stride, out, 7);
  return out;
}

// Loads the four columns src[-2], src[-1], src[0] and src[1] over 16 rows
// (rows are 'stride' bytes apart). Each output register holds one column:
// *p1 <- src[-2], *p0 <- src[-1], *q0 <- src[0], *q1 <- src[1].
static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
                                      uint8x16_t* const p1,
                                      uint8x16_t* const p0,
                                      uint8x16_t* const q0,
                                      uint8x16_t* const q1) {
  // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
  // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
  const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
  const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
  *p1 = vcombine_u8(row0.val[0], row8.val[0]);
  *p0 = vcombine_u8(row0.val[1], row8.val[1]);
  *q0 = vcombine_u8(row0.val[2], row8.val[2]);
  *q1 = vcombine_u8(row0.val[3], row8.val[3]);
}

#else  // WORK_AROUND_GCC

// Loads 4 bytes from 'src' into 32-bit lane LANE of VALUE, then advances 'src'
// by one row. Relies on 'src' and 'stride' being in scope at the use site.
#define LOADQ_LANE_32b(VALUE, LANE) do {                               \
  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));     \
  src += stride;                                                       \
} while (0)

// Same contract as the variant above (columns src[-2]..src[1] over 16 rows),
// implemented with 32-bit lane loads followed by an in-register transpose.
static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
                                      uint8x16_t* const p1,
                                      uint8x16_t* const p0,
                                      uint8x16_t* const q0,
                                      uint8x16_t* const q1) {
  const uint32x4_t zero = vdupq_n_u32(0);
  uint32x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  src -= 2;
  // Row r goes to in.val[r % 4], 32-bit lane r / 4.
  LOADQ_LANE_32b(in.val[0], 0);
  LOADQ_LANE_32b(in.val[1], 0);
  LOADQ_LANE_32b(in.val[2], 0);
  LOADQ_LANE_32b(in.val[3], 0);
  LOADQ_LANE_32b(in.val[0], 1);
  LOADQ_LANE_32b(in.val[1], 1);
  LOADQ_LANE_32b(in.val[2], 1);
  LOADQ_LANE_32b(in.val[3], 1);
  LOADQ_LANE_32b(in.val[0], 2);
  LOADQ_LANE_32b(in.val[1], 2);
  LOADQ_LANE_32b(in.val[2], 2);
  LOADQ_LANE_32b(in.val[3], 2);
  LOADQ_LANE_32b(in.val[0], 3);
  LOADQ_LANE_32b(in.val[1], 3);
  LOADQ_LANE_32b(in.val[2], 3);
  LOADQ_LANE_32b(in.val[3], 3);
  // Transpose four 4x4 parts:
  {
    const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
                                        vreinterpretq_u8_u32(in.val[1]));
    const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
                                        vreinterpretq_u8_u32(in.val[3]));
    const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                         vreinterpretq_u16_u8(row23.val[0]));
    const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                         vreinterpretq_u16_u8(row23.val[1]));
    *p1 = vreinterpretq_u8_u16(row02.val[0]);
    *p0 = vreinterpretq_u8_u16(row13.val[0]);
    *q0 = vreinterpretq_u8_u16(row02.val[1]);
    *q1 = vreinterpretq_u8_u16(row13.val[1]);
  }
}
#undef LOADQ_LANE_32b

#endif  // !WORK_AROUND_GCC

// Loads the eight columns src[-4]..src[3] over 16 rows, as two 4-column loads:
// p3..p0 receive columns src[-4]..src[-1], q0..q3 receive src[0]..src[3].
static WEBP_INLINE void Load8x16_NEON(
    const uint8_t* const src, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
  Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
}

// Loads four rows of 16 bytes: the two rows above 'src' into *p1, *p0 and the
// rows at 'src' and 'src + stride' into *q0, *q1.
static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
                                      uint8x16_t* const p1,
                                      uint8x16_t* const p0,
                                      uint8x16_t* const q0,
                                      uint8x16_t* const q1) {
  *p1 = vld1q_u8(src - 2 * stride);
  *p0 = vld1q_u8(src - 1 * stride);
  *q0 = vld1q_u8(src + 0 * stride);
  *q1 = vld1q_u8(src + 1 * stride);
}

// Loads eight rows of 16 bytes: rows 'src - 4 * stride'..'src - stride' into
// p3..p0 and rows 'src'..'src + 3 * stride' into q0..q3.
static WEBP_INLINE void Load16x8_NEON(
    const uint8_t* const src, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);
  Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);
}

// Loads eight rows of 8 bytes from each of 'u' and 'v' (rows -4..+3 relative
// to the pointers) and packs matching rows into a single register.
static WEBP_INLINE void Load8x8x2_NEON(
    const uint8_t* const u, const uint8_t* const v, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
}

#if !defined(WORK_AROUND_GCC)

// Loads row ROW of the u and v planes (8 bytes each, starting 4 bytes to the
// left of the pointers) into the low / high halves of one register. Relies on
// 'u', 'v' and 'stride' being in scope at the use site.
#define LOAD_UV_8(ROW) \
  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))

// Transposed variant of Load8x8x2_NEON(): reads rows 0..7 of the columns
// u[-4]..u[3] / v[-4]..v[3] and transposes them, so that p3 holds column -4,
// ..., p0 column -1, q0 column 0, ..., q3 column +3.
static WEBP_INLINE void Load8x8x2T_NEON(
    const uint8_t* const u, const uint8_t* const v, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  const uint8x16_t row0 = LOAD_UV_8(0);
  const uint8x16_t row1 = LOAD_UV_8(1);
  const uint8x16_t row2 = LOAD_UV_8(2);
  const uint8x16_t row3 = LOAD_UV_8(3);
  const uint8x16_t row4 = LOAD_UV_8(4);
  const uint8x16_t row5 = LOAD_UV_8(5);
  const uint8x16_t row6 = LOAD_UV_8(6);
  const uint8x16_t row7 = LOAD_UV_8(7);
  // Perform two side-by-side 8x8 transposes
  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
                                                    // u01 u11 u03 u13 ...
  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
                                                    // u21 u31 u23 u33 ...
  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                       vreinterpretq_u16_u8(row23.val[0]));
  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                       vreinterpretq_u16_u8(row23.val[1]));
  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
                                       vreinterpretq_u16_u8(row67.val[0]));
  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
                                       vreinterpretq_u16_u8(row67.val[1]));
  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
                                       vreinterpretq_u32_u16(row46.val[0]));
  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
                                       vreinterpretq_u32_u16(row46.val[1]));
  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
                                       vreinterpretq_u32_u16(row57.val[0]));
  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
                                       vreinterpretq_u32_u16(row57.val[1]));
  *p3 = vreinterpretq_u8_u32(row04.val[0]);
  *p2 = vreinterpretq_u8_u32(row15.val[0]);
  *p1 = vreinterpretq_u8_u32(row26.val[0]);
  *p0 = vreinterpretq_u8_u32(row37.val[0]);
  *q0 = vreinterpretq_u8_u32(row04.val[1]);
  *q1 = vreinterpretq_u8_u32(row15.val[1]);
  *q2 = vreinterpretq_u8_u32(row26.val[1]);
  *q3 = vreinterpretq_u8_u32(row37.val[1]);
}
#undef LOAD_UV_8

#endif  // !WORK_AROUND_GCC

// Stores 2 columns x 8 rows: row r receives the byte pair
// (v.val[0][r], v.val[1][r]) at 'dst + r * stride'.
static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
                                      uint8_t* const dst, int stride) {
  vst2_lane_u8(dst + 0 * stride, v, 0);
  vst2_lane_u8(dst + 1 * stride, v, 1);
  vst2_lane_u8(dst + 2 * stride, v, 2);
  vst2_lane_u8(dst + 3 * stride, v, 3);
  vst2_lane_u8(dst + 4 * stride, v, 4);
  vst2_lane_u8(dst + 5 * stride, v, 5);
  vst2_lane_u8(dst + 6 * stride, v, 6);
  vst2_lane_u8(dst + 7 * stride, v, 7);
}

// Stores 'p0' into column dst[-1] and 'q0' into column dst[0] over 16 rows.
static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
                                       uint8_t* const dst, int stride) {
  uint8x8x2_t lo, hi;
  lo.val[0] = vget_low_u8(p0);
  lo.val[1] = vget_low_u8(q0);
  hi.val[0] = vget_high_u8(p0);
  hi.val[1] = vget_high_u8(q0);
  Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
  Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
}

#if !defined(WORK_AROUND_GCC)
// Stores 4 columns x 8 rows: row r receives v.val[0..3][r] at
// 'dst + r * stride'.
static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
                                      uint8_t* const dst, int stride) {
  vst4_lane_u8(dst + 0 * stride, v, 0);
  vst4_lane_u8(dst + 1 * stride, v, 1);
  vst4_lane_u8(dst + 2 * stride, v, 2);
  vst4_lane_u8(dst + 3 * stride, v, 3);
  vst4_lane_u8(dst + 4 * stride, v, 4);
  vst4_lane_u8(dst + 5 * stride, v, 5);
  vst4_lane_u8(dst + 6 * stride, v, 6);
  vst4_lane_u8(dst + 7 * stride, v, 7);
}

// Stores p1, p0, q0, q1 into columns dst[-2]..dst[1] over 16 rows.
static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                       const uint8x16_t q0, const uint8x16_t q1,
                                       uint8_t* const dst, int stride) {
  uint8x8x4_t lo, hi;
  INIT_VECTOR4(lo,
               vget_low_u8(p1), vget_low_u8(p0),
               vget_low_u8(q0), vget_low_u8(q1));
  INIT_VECTOR4(hi,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
  Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
  Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
}
#endif  // !WORK_AROUND_GCC

// Stores 'p0' to the 16-byte row above 'dst' and 'q0' to the row at 'dst'.
static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
                                       uint8_t* const dst, int stride) {
  vst1q_u8(dst - stride, p0);
  vst1q_u8(dst, q0);
}

// Stores p1, p0 to the two rows above 'dst' and q0, q1 to the rows at 'dst'
// and 'dst + stride'.
static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                       const uint8x16_t q0, const uint8x16_t q1,
                                       uint8_t* const dst, int stride) {
  Store16x2_NEON(p1, p0, dst - stride, stride);
  Store16x2_NEON(q0, q1, dst + stride, stride);
}

// Stores two packed u+v rows: the low halves go to the 'u' plane, the high
// halves to the 'v' plane ('p0' to the row above the pointer, 'q0' at it).
static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
                                        const uint8x16_t q0,
                                        uint8_t* const u, uint8_t* const v,
                                        int stride) {
  // p0 and q0 contain the u+v samples packed in low/high halves.
  vst1_u8(u - stride, vget_low_u8(p0));
  vst1_u8(u, vget_low_u8(q0));
  vst1_u8(v - stride, vget_high_u8(p0));
  vst1_u8(v, vget_high_u8(q0));
}

// Stores four packed u+v rows (p1, p0 above the pointers; q0, q1 at and below).
static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
                                        const uint8x16_t p0,
                                        const uint8x16_t q0,
                                        const uint8x16_t q1,
                                        uint8_t* const u, uint8_t* const v,
                                        int stride) {
  // The p1...q1 registers contain the u+v samples packed in low/high halves.
  Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
  Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
}

#if !defined(WORK_AROUND_GCC)

// Writes lane LANE of VAL0 to DST[-3..-1] and of VAL1 to DST[0..2], then
// advances DST by one row. Relies on 'stride' being in scope at the use site.
#define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
  vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
  vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
  (DST) += stride;                                \
} while (0)

// Stores the six columns p2..q2 (packed u in the low half, v in the high
// half) around a vertical edge: 8 rows into 'u', then 8 rows into 'v'.
static WEBP_INLINE void Store6x8x2_NEON(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    uint8_t* u, uint8_t* v, int stride) {
  uint8x8x3_t u0, u1, v0, v1;
  INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
  INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
  INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
  INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
  STORE6_LANE(u, u0, u1, 0);
  STORE6_LANE(u, u0, u1, 1);
  STORE6_LANE(u, u0, u1, 2);
  STORE6_LANE(u, u0, u1, 3);
  STORE6_LANE(u, u0, u1, 4);
  STORE6_LANE(u, u0, u1, 5);
  STORE6_LANE(u, u0, u1, 6);
  STORE6_LANE(u, u0, u1, 7);
  STORE6_LANE(v, v0, v1, 0);
  STORE6_LANE(v, v0, v1, 1);
  STORE6_LANE(v, v0, v1, 2);
  STORE6_LANE(v, v0, v1, 3);
  STORE6_LANE(v, v0, v1, 4);
  STORE6_LANE(v, v0, v1, 5);
  STORE6_LANE(v, v0, v1, 6);
  STORE6_LANE(v, v0, v1, 7);
}
#undef STORE6_LANE

// Stores the four columns p1..q1 (packed u in the low half, v in the high
// half) into columns [-2..1] of 8 rows of 'u' and of 'v'.
static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
                                        const uint8x16_t p0,
                                        const uint8x16_t q0,
                                        const uint8x16_t q1,
                                        uint8_t* const u, uint8_t* const v,
                                        int stride) {
  uint8x8x4_t u0, v0;
  INIT_VECTOR4(u0,
               vget_low_u8(p1), vget_low_u8(p0),
               vget_low_u8(q0), vget_low_u8(q1));
  INIT_VECTOR4(v0,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
}

#endif  // !WORK_AROUND_GCC

// Zero extend 'v' to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(v));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
                                                 const int16x8_t dst01,
                                                 const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

// Adds the 4x4 residual held in 'row01' (rows 0 and 1) and 'row23' (rows 2
// and 3), descaled by a rounding right shift of 3, to the 4x4 pixels at 'dst'
// (rows BPS bytes apart), saturating the result to 8 bits.
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
                                    uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels.
  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));

    // Descale the residual with rounding and add it to the pixels.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Saturate to 8b and write the result back.
    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
}

//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

// Returns a per-lane mask that is all-ones where
// 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh and zero elsewhere. The sums
// use saturating unsigned adds; 'thresh' is narrowed to uint8_t.
static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   int thresh) {
  const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
  const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
  const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
  const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
  const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
  const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
  const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
  return mask;
}

// Converts unsigned samples to signed ones by toggling the top bit of every
// lane (i.e. subtracting 128).
static int8x16_t FlipSign_NEON(const uint8x16_t v) {
  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
  return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
}

// Inverse of FlipSign_NEON(): toggles the top bit of every lane and returns
// the result as unsigned samples.
static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
  // The XOR is performed on unsigned lanes: 0x80 (128) is not representable
  // as an int8_t, so building the constant with vdupq_n_s8() would depend on
  // an implementation-defined narrowing conversion.
  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
  return veorq_u8(vreinterpretq_u8_s8(v), sign_bit);
}

// Returns (p1 - q1) + 3 * (q0 - p0), every step computed with saturating
// signed 8-bit arithmetic.
static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
                                   const int8x16_t q0, const int8x16_t q1) {
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
  const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
  const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
  const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
  return s3;
}

// Returns 3 * (q0 - p0), every step computed with saturating signed 8-bit
// arithmetic.
static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
  const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
  return s2;
}

//------------------------------------------------------------------------------

// Applies the 2-pixel filter update to signed samples: adds (delta + 3) >> 3
// to 'p0s' and subtracts (delta + 4) >> 3 from 'q0s', using saturating adds
// and arithmetic shifts. The outputs stay signed (no FlipSignBack is applied).
static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
                                    const int8x16_t delta,
                                    int8x16_t* const op0,
                                    int8x16_t* const oq0) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
  *op0 = vqaddq_s8(p0s, delta3);
  *oq0 = vqsubq_s8(q0s, delta4);
}

#if defined(WEBP_USE_INTRINSICS)

// Same arithmetic as ApplyFilter2NoFlip_NEON(), but the results are converted
// back to unsigned samples with FlipSignBack_NEON() before being returned.
static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
                              const int8x16_t delta,
                              uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
  const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
  const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
  *op0 = FlipSignBack_NEON(sp0);
  *oq0 = FlipSignBack_NEON(sq0);
}

// Filters p0 / q0 across an edge: converts the four inputs to signed, computes
// the base delta, zeroes it in the lanes where 'mask' is zero and applies the
// 2-pixel update. The filtered values are returned in *op0 and *oq0.
static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
                           const uint8x16_t q0, const uint8x16_t q1,
                           const uint8x16_t mask,
                           uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t p1s = FlipSign_NEON(p1);
  const int8x16_t p0s = FlipSign_NEON(p0);
  const int8x16_t q0s = FlipSign_NEON(q0);
  const int8x16_t q1s = FlipSign_NEON(q1);
  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
  const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
  ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
}

// Simple filter across the horizontal edge located just above row 'p':
// reads the two rows above 'p' and the two rows starting at 'p' (16 pixels
// wide) and rewrites the rows at 'p - stride' and 'p'.
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
  uint8x16_t p1, p0, q0, q1, op0, oq0;
  Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
  {
    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
  }
  Store16x2_NEON(op0, oq0, p, stride);
}

// Simple filter across the vertical edge located just left of column 'p':
// reads columns p[-2]..p[1] over 16 rows and rewrites columns p[-1] and p[0].
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
  uint8x16_t p1, p0, q0, q1, oq0, op0;
  Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
  {
    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
  }
  Store2x16_NEON(op0, oq0, p, stride);
}

#else

// Load/Store vertical edge
// LOAD8x4 emits eight 4-byte lane loads, alternating between the two base
// registers b1 and b2, each post-incremented by 'stride' after its load.
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"

// STORE8x2 emits eight 2-byte lane stores through 'p', which is
// post-incremented by 'stride' after every store.
#define STORE8x2(c1, c2, p, stride) \
  "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \
  "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"

// Clobber list shared by the asm blocks below.
#define QRegs "q0", "q1", "q2", "q3", \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

// XORs registers 'a' and 'b' in place with 's' (the callers pass a register
// filled with 0x80, i.e. the sign bit).
#define FLIP_SIGN_BIT2(a, b, s) \
  "veor " #a "," #a "," #s " \n" \
  "veor " #b "," #b "," #s " \n" \

#define FLIP_SIGN_BIT4(a, b, c, d, s) \
  FLIP_SIGN_BIT2(a, b, s) \
  FLIP_SIGN_BIT2(c, d, s) \

// Computes the filter mask into 'mask'; uses q14 and q15 as scratch.
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \
  "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \
  "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \
  "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \
  "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \
  "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8 q14, " #thresh " \n" \
  "vcge.u8 " #mask ", q14, q15 \n" /* mask = (sum <= thresh) */

// Computes (p1 - q1) + 3 * (q0 - p0) into 'o' with saturating signed
// arithmetic; uses q15 as scratch.
#define GET_BASE_DELTA(p1, p0, q0, q1, o) \
  "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \
  "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \
  "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (q0 - p0) */

// Updates 'p0' and 'q0' in place from the filter value 'fl'; uses q15 as
// scratch.
#define DO_SIMPLE_FILTER(p0, q0, fl) \
  "vmov.i8 q15, #0x03 \n" \
  "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \
  "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \
  "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \
  \
  "vmov.i8 q15, #0x04 \n" \
  "vqadd.s8 q15, q15, " #fl " \n" /* filter2 = filter + 4 */ \
  "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \
  "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */

// Applies filter on 2 pixels (p0 and q0)
// Uses q9, q10, q11, q14 and q15 as scratch, so the pixel arguments must not
// live in any of these registers.
#define DO_FILTER2(p1, p0, q0, q1, thresh) \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
  "vmov.i8 q10, #0x80 \n" /* sign bit */ \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \
  GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \
  "vand q9, q9, q11 \n" /* apply filter mask */ \
  DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
  FLIP_SIGN_BIT2(p0, q0, q10)

// Inline-asm version of the simple filter across the horizontal edge located
// just above row 'p': loads the rows p - 2 * stride .. p + stride (16 bytes
// each) and stores the filtered rows back at 'p - stride' and 'p'.
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride

    "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1
    "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
    "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
    "vld1.u8 {q12}, [%[p]] \n" // q1

    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride

    "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0
    "vst1.u8 {q3}, [%[p]] \n" // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}

// Inline-asm version of the simple filter across the vertical edge located
// just left of column 'p': loads 4 bytes per row starting at 'p - 2' for 16
// rows and stores 2 bytes per row starting at 'p - 1'. r4, r5 and r6 are used
// as scratch registers and are listed as clobbers.
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub r4, %[p], #2 \n" // base1 = p - 2
    "lsl r6, %[stride], #1 \n" // r6 = 2 * stride
    "add r5, r4, %[stride] \n" // base2 = base1 + stride

    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    "vswp d3, d24 \n" // p1:q1 p0:q3
    "vswp d5, d26 \n" // q0:q2 q1:q4
    "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4

    DO_FILTER2(q1, q2, q12, q13, %[thresh])

    "sub %[p], %[p], #1 \n" // p - 1

    "vswp d5, d24 \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d24, d25, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}

#undef LOAD8x4
#undef STORE8x2

#endif  // WEBP_USE_INTRINSICS

// Applies SimpleVFilter16_NEON() to the three horizontal edges located at
// rows p + 4 * stride, p + 8 * stride and p + 12 * stride.
static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  uint32_t k;
  for (k = 3; k != 0; --k) {
    p += 4 * stride;
    SimpleVFilter16_NEON(p, stride, thresh);
  }
}

// Applies SimpleHFilter16_NEON() to the three vertical edges located at
// columns p + 4, p + 8 and p + 12.
static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  uint32_t k;
  for (k = 3; k != 0; --k) {
    p += 4;
    SimpleHFilter16_NEON(p, stride, thresh);
  }
}

//------------------------------------------------------------------------------ 673 // Complex In-loop filtering (Paragraph 15.3) 674 675 static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0, 676 const uint8x16_t q0, const uint8x16_t q1, 677 int hev_thresh) { 678 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh); 679 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) 680 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) 681 const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0); 682 const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v); 683 return mask; 684 } 685 686 static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2, 687 const uint8x16_t p1, const uint8x16_t p0, 688 const uint8x16_t q0, const uint8x16_t q1, 689 const uint8x16_t q2, const uint8x16_t q3, 690 int ithresh, int thresh) { 691 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh); 692 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2) 693 const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1) 694 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) 695 const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2) 696 const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1) 697 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) 698 const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1); 699 const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2); 700 const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0); 701 const uint8x16_t max12 = vmaxq_u8(max1, max2); 702 const uint8x16_t max123 = vmaxq_u8(max12, max3); 703 const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123); 704 const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh); 705 const uint8x16_t mask = vandq_u8(mask1, mask2); 706 return mask; 707 } 708 709 // 4-points filter 710 711 static void ApplyFilter4_NEON( 712 const int8x16_t p1, const int8x16_t p0, 713 const int8x16_t q0, const int8x16_t q1, 714 const int8x16_t delta0, 715 
uint8x16_t* const op1, uint8x16_t* const op0, 716 uint8x16_t* const oq0, uint8x16_t* const oq1) { 717 const int8x16_t kCst3 = vdupq_n_s8(0x03); 718 const int8x16_t kCst4 = vdupq_n_s8(0x04); 719 const int8x16_t delta1 = vqaddq_s8(delta0, kCst4); 720 const int8x16_t delta2 = vqaddq_s8(delta0, kCst3); 721 const int8x16_t a1 = vshrq_n_s8(delta1, 3); 722 const int8x16_t a2 = vshrq_n_s8(delta2, 3); 723 const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1 724 *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2)); // clip(p0 + a2) 725 *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1) 726 *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3)); // clip(p1 + a3) 727 *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3)); // clip(q1 - a3) 728 } 729 730 static void DoFilter4_NEON( 731 const uint8x16_t p1, const uint8x16_t p0, 732 const uint8x16_t q0, const uint8x16_t q1, 733 const uint8x16_t mask, const uint8x16_t hev_mask, 734 uint8x16_t* const op1, uint8x16_t* const op0, 735 uint8x16_t* const oq0, uint8x16_t* const oq1) { 736 // This is a fused version of DoFilter2() calling ApplyFilter2 directly 737 const int8x16_t p1s = FlipSign_NEON(p1); 738 int8x16_t p0s = FlipSign_NEON(p0); 739 int8x16_t q0s = FlipSign_NEON(q0); 740 const int8x16_t q1s = FlipSign_NEON(q1); 741 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); 742 743 // do_filter2 part (simple loopfilter on pixels with hev) 744 { 745 const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s); 746 const int8x16_t simple_lf_delta = 747 vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask)); 748 ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s); 749 } 750 751 // do_filter4 part (complex loopfilter on pixels without hev) 752 { 753 const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s); 754 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask 755 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); 756 const int8x16_t complex_lf_delta = 757 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); 
758 ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1); 759 } 760 } 761 762 // 6-points filter 763 764 static void ApplyFilter6_NEON( 765 const int8x16_t p2, const int8x16_t p1, const int8x16_t p0, 766 const int8x16_t q0, const int8x16_t q1, const int8x16_t q2, 767 const int8x16_t delta, 768 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, 769 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { 770 // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7 771 // Turns out, there's a common sub-expression S=9 * a - 1 that can be used 772 // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction: 773 // X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7 774 const int8x8_t delta_lo = vget_low_s8(delta); 775 const int8x8_t delta_hi = vget_high_s8(delta); 776 const int8x8_t kCst9 = vdup_n_s8(9); 777 const int16x8_t kCstm1 = vdupq_n_s16(-1); 778 const int8x8_t kCst18 = vdup_n_s8(18); 779 const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo); // S = 9 * a - 1 780 const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi); 781 const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo); // S + 18 * a 782 const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi); 783 const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7); // (9 * a + 63) >> 7 784 const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7); 785 const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6); // (9 * a + 31) >> 6 786 const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6); 787 const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7); // (27 * a + 63) >> 7 788 const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7); 789 const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi); 790 const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi); 791 const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi); 792 793 *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1)); // clip(p0 + a1) 794 *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - q1) 795 *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2)); // 
clip(q1 - a2) 796 *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2)); // clip(p1 + a2) 797 *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3)); // clip(q2 - a3) 798 *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3)); // clip(p2 + a3) 799 } 800 801 static void DoFilter6_NEON( 802 const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, 803 const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, 804 const uint8x16_t mask, const uint8x16_t hev_mask, 805 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, 806 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { 807 // This is a fused version of DoFilter2() calling ApplyFilter2 directly 808 const int8x16_t p2s = FlipSign_NEON(p2); 809 const int8x16_t p1s = FlipSign_NEON(p1); 810 int8x16_t p0s = FlipSign_NEON(p0); 811 int8x16_t q0s = FlipSign_NEON(q0); 812 const int8x16_t q1s = FlipSign_NEON(q1); 813 const int8x16_t q2s = FlipSign_NEON(q2); 814 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); 815 const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s); 816 817 // do_filter2 part (simple loopfilter on pixels with hev) 818 { 819 const int8x16_t simple_lf_delta = 820 vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask)); 821 ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s); 822 } 823 824 // do_filter6 part (complex loopfilter on pixels without hev) 825 { 826 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask 827 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); 828 const int8x16_t complex_lf_delta = 829 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); 830 ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta, 831 op2, op1, op0, oq0, oq1, oq2); 832 } 833 } 834 835 // on macroblock edges 836 837 static void VFilter16_NEON(uint8_t* p, int stride, 838 int thresh, int ithresh, int hev_thresh) { 839 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; 840 Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); 841 { 842 const 
uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, 843 ithresh, thresh); 844 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); 845 uint8x16_t op2, op1, op0, oq0, oq1, oq2; 846 DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask, 847 &op2, &op1, &op0, &oq0, &oq1, &oq2); 848 Store16x2_NEON(op2, op1, p - 2 * stride, stride); 849 Store16x2_NEON(op0, oq0, p + 0 * stride, stride); 850 Store16x2_NEON(oq1, oq2, p + 2 * stride, stride); 851 } 852 } 853 854 static void HFilter16_NEON(uint8_t* p, int stride, 855 int thresh, int ithresh, int hev_thresh) { 856 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; 857 Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); 858 { 859 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, 860 ithresh, thresh); 861 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); 862 uint8x16_t op2, op1, op0, oq0, oq1, oq2; 863 DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask, 864 &op2, &op1, &op0, &oq0, &oq1, &oq2); 865 Store2x16_NEON(op2, op1, p - 2, stride); 866 Store2x16_NEON(op0, oq0, p + 0, stride); 867 Store2x16_NEON(oq1, oq2, p + 2, stride); 868 } 869 } 870 871 // on three inner edges 872 static void VFilter16i_NEON(uint8_t* p, int stride, 873 int thresh, int ithresh, int hev_thresh) { 874 uint32_t k; 875 uint8x16_t p3, p2, p1, p0; 876 Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0); 877 for (k = 3; k != 0; --k) { 878 uint8x16_t q0, q1, q2, q3; 879 p += 4 * stride; 880 Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3); 881 { 882 const uint8x16_t mask = 883 NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); 884 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); 885 // p3 and p2 are not just temporary variables here: they will be 886 // re-used for next span. And q2/q3 will become p1/p0 accordingly. 
887 DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); 888 Store16x4_NEON(p1, p0, p3, p2, p, stride); 889 p1 = q2; 890 p0 = q3; 891 } 892 } 893 } 894 895 #if !defined(WORK_AROUND_GCC) 896 static void HFilter16i_NEON(uint8_t* p, int stride, 897 int thresh, int ithresh, int hev_thresh) { 898 uint32_t k; 899 uint8x16_t p3, p2, p1, p0; 900 Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0); 901 for (k = 3; k != 0; --k) { 902 uint8x16_t q0, q1, q2, q3; 903 p += 4; 904 Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3); 905 { 906 const uint8x16_t mask = 907 NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); 908 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); 909 DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); 910 Store4x16_NEON(p1, p0, p3, p2, p, stride); 911 p1 = q2; 912 p0 = q3; 913 } 914 } 915 } 916 #endif // !WORK_AROUND_GCC 917 918 // 8-pixels wide variant, for chroma filtering 919 static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride, 920 int thresh, int ithresh, int hev_thresh) { 921 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; 922 Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); 923 { 924 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, 925 ithresh, thresh); 926 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); 927 uint8x16_t op2, op1, op0, oq0, oq1, oq2; 928 DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask, 929 &op2, &op1, &op0, &oq0, &oq1, &oq2); 930 Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride); 931 Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride); 932 Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride); 933 } 934 } 935 static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, 936 int thresh, int ithresh, int hev_thresh) { 937 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; 938 u += 4 * stride; 939 v += 4 * stride; 940 Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, 
&p0, &q0, &q1, &q2, &q3); 941 { 942 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, 943 ithresh, thresh); 944 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); 945 uint8x16_t op1, op0, oq0, oq1; 946 DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); 947 Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride); 948 } 949 } 950 951 #if !defined(WORK_AROUND_GCC) 952 static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride, 953 int thresh, int ithresh, int hev_thresh) { 954 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; 955 Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); 956 { 957 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, 958 ithresh, thresh); 959 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); 960 uint8x16_t op2, op1, op0, oq0, oq1, oq2; 961 DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask, 962 &op2, &op1, &op0, &oq0, &oq1, &oq2); 963 Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride); 964 } 965 } 966 967 static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, 968 int thresh, int ithresh, int hev_thresh) { 969 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; 970 u += 4; 971 v += 4; 972 Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); 973 { 974 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, 975 ithresh, thresh); 976 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); 977 uint8x16_t op1, op0, oq0, oq1; 978 DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); 979 Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride); 980 } 981 } 982 #endif // !WORK_AROUND_GCC 983 984 //----------------------------------------------------------------------------- 985 // Inverse transforms (Paragraph 14.4) 986 987 // Technically these are unsigned but vqdmulh is only available in signed. 
988 // vqdmulh returns high half (effectively >> 16) but also doubles the value, 989 // changing the >> 16 to >> 15 and requiring an additional >> 1. 990 // We use this to our advantage with kC2. The canonical value is 35468. 991 // However, the high bit is set so treating it as signed will give incorrect 992 // results. We avoid this by down shifting by 1 here to clear the highest bit. 993 // Combined with the doubling effect of vqdmulh we get >> 16. 994 // This can not be applied to kC1 because the lowest bit is set. Down shifting 995 // the constant would reduce precision. 996 997 // libwebp uses a trick to avoid some extra addition that libvpx does. 998 // Instead of: 999 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); 1000 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the 1001 // same issue with kC1 and vqdmulh that we work around by down shifting kC2 1002 1003 static const int16_t kC1 = 20091; 1004 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. 1005 1006 #if defined(WEBP_USE_INTRINSICS) 1007 static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0, 1008 const int16x8_t in1, 1009 int16x8x2_t* const out) { 1010 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 1011 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 1012 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... 1013 // b0 d0 b1 d1 b2 d2 ... 
1014 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); 1015 } 1016 1017 static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) { 1018 // {rows} = in0 | in4 1019 // in8 | in12 1020 // B1 = in4 | in12 1021 const int16x8_t B1 = 1022 vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1])); 1023 // C0 = kC1 * in4 | kC1 * in12 1024 // C1 = kC2 * in4 | kC2 * in12 1025 const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1); 1026 const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2); 1027 const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]), 1028 vget_low_s16(rows->val[1])); // in0 + in8 1029 const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]), 1030 vget_low_s16(rows->val[1])); // in0 - in8 1031 // c = kC2 * in4 - kC1 * in12 1032 // d = kC1 * in4 + kC2 * in12 1033 const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0)); 1034 const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1)); 1035 const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b 1036 const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c 1037 const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c 1038 const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c 1039 const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp)); 1040 Transpose8x2_NEON(E0, E1, rows); 1041 } 1042 1043 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { 1044 int16x8x2_t rows; 1045 INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); 1046 TransformPass_NEON(&rows); 1047 TransformPass_NEON(&rows); 1048 Add4x4_NEON(rows.val[0], rows.val[1], dst); 1049 } 1050 1051 #else 1052 1053 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { 1054 const int kBPS = BPS; 1055 // kC1, kC2. 
Padded because vld1.16 loads 8 bytes 1056 const int16_t constants[4] = { kC1, kC2, 0, 0 }; 1057 /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */ 1058 __asm__ volatile ( 1059 "vld1.16 {q1, q2}, [%[in]] \n" 1060 "vld1.16 {d0}, [%[constants]] \n" 1061 1062 /* d2: in[0] 1063 * d3: in[8] 1064 * d4: in[4] 1065 * d5: in[12] 1066 */ 1067 "vswp d3, d4 \n" 1068 1069 /* q8 = {in[4], in[12]} * kC1 * 2 >> 16 1070 * q9 = {in[4], in[12]} * kC2 >> 16 1071 */ 1072 "vqdmulh.s16 q8, q2, d0[0] \n" 1073 "vqdmulh.s16 q9, q2, d0[1] \n" 1074 1075 /* d22 = a = in[0] + in[8] 1076 * d23 = b = in[0] - in[8] 1077 */ 1078 "vqadd.s16 d22, d2, d3 \n" 1079 "vqsub.s16 d23, d2, d3 \n" 1080 1081 /* The multiplication should be x * kC1 >> 16 1082 * However, with vqdmulh we get x * kC1 * 2 >> 16 1083 * (multiply, double, return high half) 1084 * We avoided this in kC2 by pre-shifting the constant. 1085 * q8 = in[4]/[12] * kC1 >> 16 1086 */ 1087 "vshr.s16 q8, q8, #1 \n" 1088 1089 /* Add {in[4], in[12]} back after the multiplication. This is handled by 1090 * adding 1 << 16 to kC1 in the libwebp C code. 
1091 */ 1092 "vqadd.s16 q8, q2, q8 \n" 1093 1094 /* d20 = c = in[4]*kC2 - in[12]*kC1 1095 * d21 = d = in[4]*kC1 + in[12]*kC2 1096 */ 1097 "vqsub.s16 d20, d18, d17 \n" 1098 "vqadd.s16 d21, d19, d16 \n" 1099 1100 /* d2 = tmp[0] = a + d 1101 * d3 = tmp[1] = b + c 1102 * d4 = tmp[2] = b - c 1103 * d5 = tmp[3] = a - d 1104 */ 1105 "vqadd.s16 d2, d22, d21 \n" 1106 "vqadd.s16 d3, d23, d20 \n" 1107 "vqsub.s16 d4, d23, d20 \n" 1108 "vqsub.s16 d5, d22, d21 \n" 1109 1110 "vzip.16 q1, q2 \n" 1111 "vzip.16 q1, q2 \n" 1112 1113 "vswp d3, d4 \n" 1114 1115 /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 1116 * q9 = {tmp[4], tmp[12]} * kC2 >> 16 1117 */ 1118 "vqdmulh.s16 q8, q2, d0[0] \n" 1119 "vqdmulh.s16 q9, q2, d0[1] \n" 1120 1121 /* d22 = a = tmp[0] + tmp[8] 1122 * d23 = b = tmp[0] - tmp[8] 1123 */ 1124 "vqadd.s16 d22, d2, d3 \n" 1125 "vqsub.s16 d23, d2, d3 \n" 1126 1127 /* See long winded explanations prior */ 1128 "vshr.s16 q8, q8, #1 \n" 1129 "vqadd.s16 q8, q2, q8 \n" 1130 1131 /* d20 = c = in[4]*kC2 - in[12]*kC1 1132 * d21 = d = in[4]*kC1 + in[12]*kC2 1133 */ 1134 "vqsub.s16 d20, d18, d17 \n" 1135 "vqadd.s16 d21, d19, d16 \n" 1136 1137 /* d2 = tmp[0] = a + d 1138 * d3 = tmp[1] = b + c 1139 * d4 = tmp[2] = b - c 1140 * d5 = tmp[3] = a - d 1141 */ 1142 "vqadd.s16 d2, d22, d21 \n" 1143 "vqadd.s16 d3, d23, d20 \n" 1144 "vqsub.s16 d4, d23, d20 \n" 1145 "vqsub.s16 d5, d22, d21 \n" 1146 1147 "vld1.32 d6[0], [%[dst]], %[kBPS] \n" 1148 "vld1.32 d6[1], [%[dst]], %[kBPS] \n" 1149 "vld1.32 d7[0], [%[dst]], %[kBPS] \n" 1150 "vld1.32 d7[1], [%[dst]], %[kBPS] \n" 1151 1152 "sub %[dst], %[dst], %[kBPS], lsl #2 \n" 1153 1154 /* (val) + 4 >> 3 */ 1155 "vrshr.s16 d2, d2, #3 \n" 1156 "vrshr.s16 d3, d3, #3 \n" 1157 "vrshr.s16 d4, d4, #3 \n" 1158 "vrshr.s16 d5, d5, #3 \n" 1159 1160 "vzip.16 q1, q2 \n" 1161 "vzip.16 q1, q2 \n" 1162 1163 /* Must accumulate before saturating */ 1164 "vmovl.u8 q8, d6 \n" 1165 "vmovl.u8 q9, d7 \n" 1166 1167 "vqadd.s16 q1, q1, q8 \n" 1168 "vqadd.s16 q2, q2, q9 \n" 1169 1170 
"vqmovun.s16 d0, q1 \n" 1171 "vqmovun.s16 d1, q2 \n" 1172 1173 "vst1.32 d0[0], [%[dst]], %[kBPS] \n" 1174 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 1175 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 1176 "vst1.32 d1[1], [%[dst]] \n" 1177 1178 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ 1179 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */ 1180 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */ 1181 ); 1182 } 1183 1184 #endif // WEBP_USE_INTRINSICS 1185 1186 static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) { 1187 TransformOne_NEON(in, dst); 1188 if (do_two) { 1189 TransformOne_NEON(in + 16, dst + 4); 1190 } 1191 } 1192 1193 static void TransformDC_NEON(const int16_t* in, uint8_t* dst) { 1194 const int16x8_t DC = vdupq_n_s16(in[0]); 1195 Add4x4_NEON(DC, DC, dst); 1196 } 1197 1198 //------------------------------------------------------------------------------ 1199 1200 #define STORE_WHT(dst, col, rows) do { \ 1201 *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \ 1202 *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \ 1203 *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \ 1204 *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \ 1205 } while (0) 1206 1207 static void TransformWHT_NEON(const int16_t* in, int16_t* out) { 1208 int32x4x4_t tmp; 1209 1210 { 1211 // Load the source. 
1212 const int16x4_t in00_03 = vld1_s16(in + 0); 1213 const int16x4_t in04_07 = vld1_s16(in + 4); 1214 const int16x4_t in08_11 = vld1_s16(in + 8); 1215 const int16x4_t in12_15 = vld1_s16(in + 12); 1216 const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15] 1217 const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11] 1218 const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11] 1219 const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15] 1220 tmp.val[0] = vaddq_s32(a0, a1); 1221 tmp.val[1] = vaddq_s32(a3, a2); 1222 tmp.val[2] = vsubq_s32(a0, a1); 1223 tmp.val[3] = vsubq_s32(a3, a2); 1224 // Arrange the temporary results column-wise. 1225 tmp = Transpose4x4_NEON(tmp); 1226 } 1227 1228 { 1229 const int32x4_t kCst3 = vdupq_n_s32(3); 1230 const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder 1231 const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]); 1232 const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]); 1233 const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]); 1234 const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]); 1235 1236 tmp.val[0] = vaddq_s32(a0, a1); 1237 tmp.val[1] = vaddq_s32(a3, a2); 1238 tmp.val[2] = vsubq_s32(a0, a1); 1239 tmp.val[3] = vsubq_s32(a3, a2); 1240 1241 // right shift the results by 3. 
1242 tmp.val[0] = vshrq_n_s32(tmp.val[0], 3); 1243 tmp.val[1] = vshrq_n_s32(tmp.val[1], 3); 1244 tmp.val[2] = vshrq_n_s32(tmp.val[2], 3); 1245 tmp.val[3] = vshrq_n_s32(tmp.val[3], 3); 1246 1247 STORE_WHT(out, 0, tmp); 1248 STORE_WHT(out, 1, tmp); 1249 STORE_WHT(out, 2, tmp); 1250 STORE_WHT(out, 3, tmp); 1251 } 1252 } 1253 1254 #undef STORE_WHT 1255 1256 //------------------------------------------------------------------------------ 1257 1258 #define MUL(a, b) (((a) * (b)) >> 16) 1259 static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) { 1260 static const int kC1_full = 20091 + (1 << 16); 1261 static const int kC2_full = 35468; 1262 const int16x4_t A = vld1_dup_s16(in); 1263 const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full)); 1264 const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full)); 1265 const int c1 = MUL(in[1], kC2_full); 1266 const int d1 = MUL(in[1], kC1_full); 1267 const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 | 1268 (uint64_t)( c1 & 0xffff) << 16 | 1269 (uint64_t)(-c1 & 0xffff) << 32 | 1270 (uint64_t)(-d1 & 0xffff) << 48; 1271 const int16x4_t CD = vcreate_s16(cd); 1272 const int16x4_t B = vqadd_s16(A, CD); 1273 const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4)); 1274 const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4)); 1275 Add4x4_NEON(m0_m1, m2_m3, dst); 1276 } 1277 #undef MUL 1278 1279 //------------------------------------------------------------------------------ 1280 // 4x4 1281 1282 static void DC4_NEON(uint8_t* dst) { // DC 1283 const uint8x8_t A = vld1_u8(dst - BPS); // top row 1284 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top 1285 const uint16x4_t p1 = vpadd_u16(p0, p0); 1286 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); 1287 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); 1288 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); 1289 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); 1290 const uint16x8_t s0 = 
vaddq_u16(L0, L1); 1291 const uint16x8_t s1 = vaddq_u16(L2, L3); 1292 const uint16x8_t s01 = vaddq_u16(s0, s1); 1293 const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1)); 1294 const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3 1295 const uint8x8_t dc = vdup_lane_u8(dc0, 0); 1296 int i; 1297 for (i = 0; i < 4; ++i) { 1298 vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0); 1299 } 1300 } 1301 1302 // TrueMotion (4x4 + 8x8) 1303 static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) { 1304 const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]' 1305 const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]' 1306 const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1] 1307 int y; 1308 for (y = 0; y < size; y += 4) { 1309 // left edge 1310 const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1)); 1311 const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1)); 1312 const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1)); 1313 const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1)); 1314 const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1] 1315 const int16x8_t r1 = vaddq_s16(L1, d); 1316 const int16x8_t r2 = vaddq_s16(L2, d); 1317 const int16x8_t r3 = vaddq_s16(L3, d); 1318 // Saturate and store the result. 
1319 const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0)); 1320 const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1)); 1321 const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2)); 1322 const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3)); 1323 if (size == 4) { 1324 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0); 1325 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0); 1326 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0); 1327 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0); 1328 } else { 1329 vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32); 1330 vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32); 1331 vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32); 1332 vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32); 1333 } 1334 dst += 4 * BPS; 1335 } 1336 } 1337 1338 static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); } 1339 1340 static void VE4_NEON(uint8_t* dst) { // vertical 1341 // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS. 
1342 const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row 1343 const uint64x1_t A1 = vshr_n_u64(A0, 8); 1344 const uint64x1_t A2 = vshr_n_u64(A0, 16); 1345 const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); 1346 const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); 1347 const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); 1348 const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00); 1349 const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0); 1350 int i; 1351 for (i = 0; i < 4; ++i) { 1352 vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0); 1353 } 1354 } 1355 1356 static void RD4_NEON(uint8_t* dst) { // Down-right 1357 const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1); 1358 const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); 1359 const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); 1360 const uint32_t I = dst[-1 + 0 * BPS]; 1361 const uint32_t J = dst[-1 + 1 * BPS]; 1362 const uint32_t K = dst[-1 + 2 * BPS]; 1363 const uint32_t L = dst[-1 + 3 * BPS]; 1364 const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24)); 1365 const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); 1366 const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); 1367 const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); 1368 const uint8_t D = vget_lane_u8(XABCD_u8, 4); 1369 const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); 1370 const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); 1371 const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); 1372 const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); 1373 const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); 1374 const uint32x2_t r3 = vreinterpret_u32_u8(avg2); 1375 const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); 1376 const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); 1377 const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); 1378 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0); 1379 
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0); 1380 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0); 1381 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0); 1382 } 1383 1384 static void LD4_NEON(uint8_t* dst) { // Down-left 1385 // Note using the same shift trick as VE4() is slower here. 1386 const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0); 1387 const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1); 1388 const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2); 1389 const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6); 1390 const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0); 1391 const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); 1392 const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); 1393 const uint32x2_t r0 = vreinterpret_u32_u8(avg2); 1394 const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); 1395 const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); 1396 const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); 1397 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0); 1398 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0); 1399 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0); 1400 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0); 1401 } 1402 1403 //------------------------------------------------------------------------------ 1404 // Chroma 1405 1406 static void VE8uv_NEON(uint8_t* dst) { // vertical 1407 const uint8x8_t top = vld1_u8(dst - BPS); 1408 int j; 1409 for (j = 0; j < 8; ++j) { 1410 vst1_u8(dst + j * BPS, top); 1411 } 1412 } 1413 1414 static void HE8uv_NEON(uint8_t* dst) { // horizontal 1415 int j; 1416 for (j = 0; j < 8; ++j) { 1417 const uint8x8_t left = vld1_dup_u8(dst - 1); 1418 vst1_u8(dst, left); 1419 dst += BPS; 1420 } 1421 } 1422 1423 static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) { 1424 uint16x8_t sum_top; 1425 uint16x8_t sum_left; 1426 uint8x8_t dc0; 1427 1428 if (do_top) { 1429 const uint8x8_t A = vld1_u8(dst - BPS); // top row 1430 const uint16x4_t p0 = 
vpaddl_u8(A); // cascading summation of the top 1431 const uint16x4_t p1 = vpadd_u16(p0, p0); 1432 const uint16x4_t p2 = vpadd_u16(p1, p1); 1433 sum_top = vcombine_u16(p2, p2); 1434 } 1435 1436 if (do_left) { 1437 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); 1438 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); 1439 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); 1440 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); 1441 const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1)); 1442 const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1)); 1443 const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1)); 1444 const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1)); 1445 const uint16x8_t s0 = vaddq_u16(L0, L1); 1446 const uint16x8_t s1 = vaddq_u16(L2, L3); 1447 const uint16x8_t s2 = vaddq_u16(L4, L5); 1448 const uint16x8_t s3 = vaddq_u16(L6, L7); 1449 const uint16x8_t s01 = vaddq_u16(s0, s1); 1450 const uint16x8_t s23 = vaddq_u16(s2, s3); 1451 sum_left = vaddq_u16(s01, s23); 1452 } 1453 1454 if (do_top && do_left) { 1455 const uint16x8_t sum = vaddq_u16(sum_left, sum_top); 1456 dc0 = vrshrn_n_u16(sum, 4); 1457 } else if (do_top) { 1458 dc0 = vrshrn_n_u16(sum_top, 3); 1459 } else if (do_left) { 1460 dc0 = vrshrn_n_u16(sum_left, 3); 1461 } else { 1462 dc0 = vdup_n_u8(0x80); 1463 } 1464 1465 { 1466 const uint8x8_t dc = vdup_lane_u8(dc0, 0); 1467 int i; 1468 for (i = 0; i < 8; ++i) { 1469 vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc)); 1470 } 1471 } 1472 } 1473 1474 static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); } 1475 static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); } 1476 static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); } 1477 static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); } 1478 1479 static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); } 1480 1481 
//------------------------------------------------------------------------------ 1482 // 16x16 1483 1484 static void VE16_NEON(uint8_t* dst) { // vertical 1485 const uint8x16_t top = vld1q_u8(dst - BPS); 1486 int j; 1487 for (j = 0; j < 16; ++j) { 1488 vst1q_u8(dst + j * BPS, top); 1489 } 1490 } 1491 1492 static void HE16_NEON(uint8_t* dst) { // horizontal 1493 int j; 1494 for (j = 0; j < 16; ++j) { 1495 const uint8x16_t left = vld1q_dup_u8(dst - 1); 1496 vst1q_u8(dst, left); 1497 dst += BPS; 1498 } 1499 } 1500 1501 static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) { 1502 uint16x8_t sum_top; 1503 uint16x8_t sum_left; 1504 uint8x8_t dc0; 1505 1506 if (do_top) { 1507 const uint8x16_t A = vld1q_u8(dst - BPS); // top row 1508 const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top 1509 const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); 1510 const uint16x4_t p2 = vpadd_u16(p1, p1); 1511 const uint16x4_t p3 = vpadd_u16(p2, p2); 1512 sum_top = vcombine_u16(p3, p3); 1513 } 1514 1515 if (do_left) { 1516 int i; 1517 sum_left = vdupq_n_u16(0); 1518 for (i = 0; i < 16; i += 8) { 1519 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1)); 1520 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1)); 1521 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1)); 1522 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1)); 1523 const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1)); 1524 const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1)); 1525 const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1)); 1526 const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1)); 1527 const uint16x8_t s0 = vaddq_u16(L0, L1); 1528 const uint16x8_t s1 = vaddq_u16(L2, L3); 1529 const uint16x8_t s2 = vaddq_u16(L4, L5); 1530 const uint16x8_t s3 = vaddq_u16(L6, L7); 1531 const uint16x8_t s01 = vaddq_u16(s0, s1); 1532 const uint16x8_t s23 = vaddq_u16(s2, s3); 1533 const 
uint16x8_t sum = vaddq_u16(s01, s23); 1534 sum_left = vaddq_u16(sum_left, sum); 1535 } 1536 } 1537 1538 if (do_top && do_left) { 1539 const uint16x8_t sum = vaddq_u16(sum_left, sum_top); 1540 dc0 = vrshrn_n_u16(sum, 5); 1541 } else if (do_top) { 1542 dc0 = vrshrn_n_u16(sum_top, 4); 1543 } else if (do_left) { 1544 dc0 = vrshrn_n_u16(sum_left, 4); 1545 } else { 1546 dc0 = vdup_n_u8(0x80); 1547 } 1548 1549 { 1550 const uint8x16_t dc = vdupq_lane_u8(dc0, 0); 1551 int i; 1552 for (i = 0; i < 16; ++i) { 1553 vst1q_u8(dst + i * BPS, dc); 1554 } 1555 } 1556 } 1557 1558 static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); } 1559 static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); } 1560 static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); } 1561 static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); } 1562 1563 static void TM16_NEON(uint8_t* dst) { 1564 const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]' 1565 const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]' 1566 // A[c] - A[-1] 1567 const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL)); 1568 const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL)); 1569 int y; 1570 for (y = 0; y < 16; y += 4) { 1571 // left edge 1572 const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1)); 1573 const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1)); 1574 const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1)); 1575 const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1)); 1576 const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1] 1577 const int16x8_t r1_lo = vaddq_s16(L1, d_lo); 1578 const int16x8_t r2_lo = vaddq_s16(L2, d_lo); 1579 const int16x8_t r3_lo = vaddq_s16(L3, d_lo); 1580 const int16x8_t r0_hi = vaddq_s16(L0, d_hi); 1581 const int16x8_t r1_hi = vaddq_s16(L1, d_hi); 1582 const int16x8_t r2_hi = vaddq_s16(L2, d_hi); 1583 
const int16x8_t r3_hi = vaddq_s16(L3, d_hi); 1584 // Saturate and store the result. 1585 const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi)); 1586 const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi)); 1587 const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi)); 1588 const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi)); 1589 vst1q_u8(dst + 0 * BPS, row0); 1590 vst1q_u8(dst + 1 * BPS, row1); 1591 vst1q_u8(dst + 2 * BPS, row2); 1592 vst1q_u8(dst + 3 * BPS, row3); 1593 dst += 4 * BPS; 1594 } 1595 } 1596 1597 //------------------------------------------------------------------------------ 1598 // Entry point 1599 1600 extern void VP8DspInitNEON(void); 1601 1602 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) { 1603 VP8Transform = TransformTwo_NEON; 1604 VP8TransformAC3 = TransformAC3_NEON; 1605 VP8TransformDC = TransformDC_NEON; 1606 VP8TransformWHT = TransformWHT_NEON; 1607 1608 VP8VFilter16 = VFilter16_NEON; 1609 VP8VFilter16i = VFilter16i_NEON; 1610 VP8HFilter16 = HFilter16_NEON; 1611 #if !defined(WORK_AROUND_GCC) 1612 VP8HFilter16i = HFilter16i_NEON; 1613 #endif 1614 VP8VFilter8 = VFilter8_NEON; 1615 VP8VFilter8i = VFilter8i_NEON; 1616 #if !defined(WORK_AROUND_GCC) 1617 VP8HFilter8 = HFilter8_NEON; 1618 VP8HFilter8i = HFilter8i_NEON; 1619 #endif 1620 VP8SimpleVFilter16 = SimpleVFilter16_NEON; 1621 VP8SimpleHFilter16 = SimpleHFilter16_NEON; 1622 VP8SimpleVFilter16i = SimpleVFilter16i_NEON; 1623 VP8SimpleHFilter16i = SimpleHFilter16i_NEON; 1624 1625 VP8PredLuma4[0] = DC4_NEON; 1626 VP8PredLuma4[1] = TM4_NEON; 1627 VP8PredLuma4[2] = VE4_NEON; 1628 VP8PredLuma4[4] = RD4_NEON; 1629 VP8PredLuma4[6] = LD4_NEON; 1630 1631 VP8PredLuma16[0] = DC16TopLeft_NEON; 1632 VP8PredLuma16[1] = TM16_NEON; 1633 VP8PredLuma16[2] = VE16_NEON; 1634 VP8PredLuma16[3] = HE16_NEON; 1635 VP8PredLuma16[4] = DC16NoTop_NEON; 1636 VP8PredLuma16[5] = DC16NoLeft_NEON; 1637 VP8PredLuma16[6] = 
DC16NoTopLeft_NEON; 1638 1639 VP8PredChroma8[0] = DC8uv_NEON; 1640 VP8PredChroma8[1] = TM8uv_NEON; 1641 VP8PredChroma8[2] = VE8uv_NEON; 1642 VP8PredChroma8[3] = HE8uv_NEON; 1643 VP8PredChroma8[4] = DC8uvNoTop_NEON; 1644 VP8PredChroma8[5] = DC8uvNoLeft_NEON; 1645 VP8PredChroma8[6] = DC8uvNoTopLeft_NEON; 1646 } 1647 1648 #else // !WEBP_USE_NEON 1649 1650 WEBP_DSP_INIT_STUB(VP8DspInitNEON) 1651 1652 #endif // WEBP_USE_NEON 1653