1 // Copyright 2012 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // ARM NEON version of dsp functions and loop filtering. 11 // 12 // Authors: Somnath Banerjee (somnath (at) google.com) 13 // Johann Koenig (johannkoenig (at) google.com) 14 15 #include "./dsp.h" 16 17 #if defined(WEBP_USE_NEON) 18 19 #include "./neon.h" 20 #include "../dec/vp8i_dec.h" 21 22 //------------------------------------------------------------------------------ 23 // NxM Loading functions 24 25 // Load/Store vertical edge 26 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ 27 "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \ 28 "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \ 29 "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \ 30 "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \ 31 "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \ 32 "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \ 33 "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \ 34 "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n" 35 36 #define STORE8x2(c1, c2, p, stride) \ 37 "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \ 38 "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \ 39 "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \ 40 "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \ 41 "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \ 42 "vst2.8 {" #c1 
"[5], " #c2 "[5]}," #p "," #stride " \n" \ 43 "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \ 44 "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n" 45 46 #if !defined(WORK_AROUND_GCC) 47 48 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation 49 // (register alloc, probably). The variants somewhat mitigate the problem, but 50 // not quite. HFilter16i() remains problematic. 51 static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) { 52 const uint8x8_t zero = vdup_n_u8(0); 53 uint8x8x4_t out; 54 INIT_VECTOR4(out, zero, zero, zero, zero); 55 out = vld4_lane_u8(src + 0 * stride, out, 0); 56 out = vld4_lane_u8(src + 1 * stride, out, 1); 57 out = vld4_lane_u8(src + 2 * stride, out, 2); 58 out = vld4_lane_u8(src + 3 * stride, out, 3); 59 out = vld4_lane_u8(src + 4 * stride, out, 4); 60 out = vld4_lane_u8(src + 5 * stride, out, 5); 61 out = vld4_lane_u8(src + 6 * stride, out, 6); 62 out = vld4_lane_u8(src + 7 * stride, out, 7); 63 return out; 64 } 65 66 static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride, 67 uint8x16_t* const p1, uint8x16_t* const p0, 68 uint8x16_t* const q0, uint8x16_t* const q1) { 69 // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7] 70 // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15] 71 const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride); 72 const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride); 73 *p1 = vcombine_u8(row0.val[0], row8.val[0]); 74 *p0 = vcombine_u8(row0.val[1], row8.val[1]); 75 *q0 = vcombine_u8(row0.val[2], row8.val[2]); 76 *q1 = vcombine_u8(row0.val[3], row8.val[3]); 77 } 78 79 #else // WORK_AROUND_GCC 80 81 #define LOADQ_LANE_32b(VALUE, LANE) do { \ 82 (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \ 83 src += stride; \ 84 } while (0) 85 86 static WEBP_INLINE void Load4x16(const uint8_t* src, int stride, 87 uint8x16_t* const p1, uint8x16_t* const p0, 88 uint8x16_t* const q0, uint8x16_t* const q1) { 89 const uint32x4_t zero 
= vdupq_n_u32(0); 90 uint32x4x4_t in; 91 INIT_VECTOR4(in, zero, zero, zero, zero); 92 src -= 2; 93 LOADQ_LANE_32b(in.val[0], 0); 94 LOADQ_LANE_32b(in.val[1], 0); 95 LOADQ_LANE_32b(in.val[2], 0); 96 LOADQ_LANE_32b(in.val[3], 0); 97 LOADQ_LANE_32b(in.val[0], 1); 98 LOADQ_LANE_32b(in.val[1], 1); 99 LOADQ_LANE_32b(in.val[2], 1); 100 LOADQ_LANE_32b(in.val[3], 1); 101 LOADQ_LANE_32b(in.val[0], 2); 102 LOADQ_LANE_32b(in.val[1], 2); 103 LOADQ_LANE_32b(in.val[2], 2); 104 LOADQ_LANE_32b(in.val[3], 2); 105 LOADQ_LANE_32b(in.val[0], 3); 106 LOADQ_LANE_32b(in.val[1], 3); 107 LOADQ_LANE_32b(in.val[2], 3); 108 LOADQ_LANE_32b(in.val[3], 3); 109 // Transpose four 4x4 parts: 110 { 111 const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]), 112 vreinterpretq_u8_u32(in.val[1])); 113 const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]), 114 vreinterpretq_u8_u32(in.val[3])); 115 const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]), 116 vreinterpretq_u16_u8(row23.val[0])); 117 const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]), 118 vreinterpretq_u16_u8(row23.val[1])); 119 *p1 = vreinterpretq_u8_u16(row02.val[0]); 120 *p0 = vreinterpretq_u8_u16(row13.val[0]); 121 *q0 = vreinterpretq_u8_u16(row02.val[1]); 122 *q1 = vreinterpretq_u8_u16(row13.val[1]); 123 } 124 } 125 #undef LOADQ_LANE_32b 126 127 #endif // !WORK_AROUND_GCC 128 129 static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride, 130 uint8x16_t* const p3, uint8x16_t* const p2, 131 uint8x16_t* const p1, uint8x16_t* const p0, 132 uint8x16_t* const q0, uint8x16_t* const q1, 133 uint8x16_t* const q2, uint8x16_t* const q3) { 134 Load4x16(src - 2, stride, p3, p2, p1, p0); 135 Load4x16(src + 2, stride, q0, q1, q2, q3); 136 } 137 138 static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride, 139 uint8x16_t* const p1, uint8x16_t* const p0, 140 uint8x16_t* const q0, uint8x16_t* const q1) { 141 *p1 = vld1q_u8(src - 2 * stride); 142 *p0 = vld1q_u8(src - 
1 * stride); 143 *q0 = vld1q_u8(src + 0 * stride); 144 *q1 = vld1q_u8(src + 1 * stride); 145 } 146 147 static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride, 148 uint8x16_t* const p3, uint8x16_t* const p2, 149 uint8x16_t* const p1, uint8x16_t* const p0, 150 uint8x16_t* const q0, uint8x16_t* const q1, 151 uint8x16_t* const q2, uint8x16_t* const q3) { 152 Load16x4(src - 2 * stride, stride, p3, p2, p1, p0); 153 Load16x4(src + 2 * stride, stride, q0, q1, q2, q3); 154 } 155 156 static WEBP_INLINE void Load8x8x2(const uint8_t* const u, 157 const uint8_t* const v, 158 int stride, 159 uint8x16_t* const p3, uint8x16_t* const p2, 160 uint8x16_t* const p1, uint8x16_t* const p0, 161 uint8x16_t* const q0, uint8x16_t* const q1, 162 uint8x16_t* const q2, uint8x16_t* const q3) { 163 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination 164 // and the v-samples on the higher half. 165 *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride)); 166 *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride)); 167 *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride)); 168 *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride)); 169 *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride)); 170 *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride)); 171 *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride)); 172 *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride)); 173 } 174 175 #if !defined(WORK_AROUND_GCC) 176 177 #define LOAD_UV_8(ROW) \ 178 vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride)) 179 180 static WEBP_INLINE void Load8x8x2T(const uint8_t* const u, 181 const uint8_t* const v, 182 int stride, 183 uint8x16_t* const p3, uint8x16_t* const p2, 184 uint8x16_t* const p1, uint8x16_t* const p0, 185 uint8x16_t* const q0, uint8x16_t* const q1, 186 uint8x16_t* const q2, uint8x16_t* const q3) { 187 // We pack the 8x8 u-samples in 
the lower half of the uint8x16_t destination 188 // and the v-samples on the higher half. 189 const uint8x16_t row0 = LOAD_UV_8(0); 190 const uint8x16_t row1 = LOAD_UV_8(1); 191 const uint8x16_t row2 = LOAD_UV_8(2); 192 const uint8x16_t row3 = LOAD_UV_8(3); 193 const uint8x16_t row4 = LOAD_UV_8(4); 194 const uint8x16_t row5 = LOAD_UV_8(5); 195 const uint8x16_t row6 = LOAD_UV_8(6); 196 const uint8x16_t row7 = LOAD_UV_8(7); 197 // Perform two side-by-side 8x8 transposes 198 // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07 199 // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ... 200 // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ... 201 // u30 u31 u32 u33 u34 u35 u36 u37 | ... 202 // u40 u41 u42 u43 u44 u45 u46 u47 | ... 203 // u50 u51 u52 u53 u54 u55 u56 u57 | ... 204 // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ... 205 // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ... 206 const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ... 207 // u01 u11 u03 u13 ... 208 const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ... 209 // u21 u31 u23 u33 ... 210 const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ... 211 const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ... 
212 const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]), 213 vreinterpretq_u16_u8(row23.val[0])); 214 const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]), 215 vreinterpretq_u16_u8(row23.val[1])); 216 const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]), 217 vreinterpretq_u16_u8(row67.val[0])); 218 const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]), 219 vreinterpretq_u16_u8(row67.val[1])); 220 const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]), 221 vreinterpretq_u32_u16(row46.val[0])); 222 const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]), 223 vreinterpretq_u32_u16(row46.val[1])); 224 const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]), 225 vreinterpretq_u32_u16(row57.val[0])); 226 const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]), 227 vreinterpretq_u32_u16(row57.val[1])); 228 *p3 = vreinterpretq_u8_u32(row04.val[0]); 229 *p2 = vreinterpretq_u8_u32(row15.val[0]); 230 *p1 = vreinterpretq_u8_u32(row26.val[0]); 231 *p0 = vreinterpretq_u8_u32(row37.val[0]); 232 *q0 = vreinterpretq_u8_u32(row04.val[1]); 233 *q1 = vreinterpretq_u8_u32(row15.val[1]); 234 *q2 = vreinterpretq_u8_u32(row26.val[1]); 235 *q3 = vreinterpretq_u8_u32(row37.val[1]); 236 } 237 #undef LOAD_UV_8 238 239 #endif // !WORK_AROUND_GCC 240 241 static WEBP_INLINE void Store2x8(const uint8x8x2_t v, 242 uint8_t* const dst, int stride) { 243 vst2_lane_u8(dst + 0 * stride, v, 0); 244 vst2_lane_u8(dst + 1 * stride, v, 1); 245 vst2_lane_u8(dst + 2 * stride, v, 2); 246 vst2_lane_u8(dst + 3 * stride, v, 3); 247 vst2_lane_u8(dst + 4 * stride, v, 4); 248 vst2_lane_u8(dst + 5 * stride, v, 5); 249 vst2_lane_u8(dst + 6 * stride, v, 6); 250 vst2_lane_u8(dst + 7 * stride, v, 7); 251 } 252 253 static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0, 254 uint8_t* const dst, int stride) { 255 uint8x8x2_t lo, hi; 256 lo.val[0] = 
vget_low_u8(p0); 257 lo.val[1] = vget_low_u8(q0); 258 hi.val[0] = vget_high_u8(p0); 259 hi.val[1] = vget_high_u8(q0); 260 Store2x8(lo, dst - 1 + 0 * stride, stride); 261 Store2x8(hi, dst - 1 + 8 * stride, stride); 262 } 263 264 #if !defined(WORK_AROUND_GCC) 265 static WEBP_INLINE void Store4x8(const uint8x8x4_t v, 266 uint8_t* const dst, int stride) { 267 vst4_lane_u8(dst + 0 * stride, v, 0); 268 vst4_lane_u8(dst + 1 * stride, v, 1); 269 vst4_lane_u8(dst + 2 * stride, v, 2); 270 vst4_lane_u8(dst + 3 * stride, v, 3); 271 vst4_lane_u8(dst + 4 * stride, v, 4); 272 vst4_lane_u8(dst + 5 * stride, v, 5); 273 vst4_lane_u8(dst + 6 * stride, v, 6); 274 vst4_lane_u8(dst + 7 * stride, v, 7); 275 } 276 277 static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0, 278 const uint8x16_t q0, const uint8x16_t q1, 279 uint8_t* const dst, int stride) { 280 uint8x8x4_t lo, hi; 281 INIT_VECTOR4(lo, 282 vget_low_u8(p1), vget_low_u8(p0), 283 vget_low_u8(q0), vget_low_u8(q1)); 284 INIT_VECTOR4(hi, 285 vget_high_u8(p1), vget_high_u8(p0), 286 vget_high_u8(q0), vget_high_u8(q1)); 287 Store4x8(lo, dst - 2 + 0 * stride, stride); 288 Store4x8(hi, dst - 2 + 8 * stride, stride); 289 } 290 #endif // !WORK_AROUND_GCC 291 292 static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0, 293 uint8_t* const dst, int stride) { 294 vst1q_u8(dst - stride, p0); 295 vst1q_u8(dst, q0); 296 } 297 298 static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0, 299 const uint8x16_t q0, const uint8x16_t q1, 300 uint8_t* const dst, int stride) { 301 Store16x2(p1, p0, dst - stride, stride); 302 Store16x2(q0, q1, dst + stride, stride); 303 } 304 305 static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0, 306 uint8_t* const u, uint8_t* const v, 307 int stride) { 308 // p0 and q0 contain the u+v samples packed in low/high halves. 
309 vst1_u8(u - stride, vget_low_u8(p0)); 310 vst1_u8(u, vget_low_u8(q0)); 311 vst1_u8(v - stride, vget_high_u8(p0)); 312 vst1_u8(v, vget_high_u8(q0)); 313 } 314 315 static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0, 316 const uint8x16_t q0, const uint8x16_t q1, 317 uint8_t* const u, uint8_t* const v, 318 int stride) { 319 // The p1...q1 registers contain the u+v samples packed in low/high halves. 320 Store8x2x2(p1, p0, u - stride, v - stride, stride); 321 Store8x2x2(q0, q1, u + stride, v + stride, stride); 322 } 323 324 #if !defined(WORK_AROUND_GCC) 325 326 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \ 327 vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \ 328 vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \ 329 (DST) += stride; \ 330 } while (0) 331 332 static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1, 333 const uint8x16_t p0, const uint8x16_t q0, 334 const uint8x16_t q1, const uint8x16_t q2, 335 uint8_t* u, uint8_t* v, 336 int stride) { 337 uint8x8x3_t u0, u1, v0, v1; 338 INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0)); 339 INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2)); 340 INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0)); 341 INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2)); 342 STORE6_LANE(u, u0, u1, 0); 343 STORE6_LANE(u, u0, u1, 1); 344 STORE6_LANE(u, u0, u1, 2); 345 STORE6_LANE(u, u0, u1, 3); 346 STORE6_LANE(u, u0, u1, 4); 347 STORE6_LANE(u, u0, u1, 5); 348 STORE6_LANE(u, u0, u1, 6); 349 STORE6_LANE(u, u0, u1, 7); 350 STORE6_LANE(v, v0, v1, 0); 351 STORE6_LANE(v, v0, v1, 1); 352 STORE6_LANE(v, v0, v1, 2); 353 STORE6_LANE(v, v0, v1, 3); 354 STORE6_LANE(v, v0, v1, 4); 355 STORE6_LANE(v, v0, v1, 5); 356 STORE6_LANE(v, v0, v1, 6); 357 STORE6_LANE(v, v0, v1, 7); 358 } 359 #undef STORE6_LANE 360 361 static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0, 362 const uint8x16_t q0, const uint8x16_t q1, 363 uint8_t* 
const u, uint8_t* const v, 364 int stride) { 365 uint8x8x4_t u0, v0; 366 INIT_VECTOR4(u0, 367 vget_low_u8(p1), vget_low_u8(p0), 368 vget_low_u8(q0), vget_low_u8(q1)); 369 INIT_VECTOR4(v0, 370 vget_high_u8(p1), vget_high_u8(p0), 371 vget_high_u8(q0), vget_high_u8(q1)); 372 vst4_lane_u8(u - 2 + 0 * stride, u0, 0); 373 vst4_lane_u8(u - 2 + 1 * stride, u0, 1); 374 vst4_lane_u8(u - 2 + 2 * stride, u0, 2); 375 vst4_lane_u8(u - 2 + 3 * stride, u0, 3); 376 vst4_lane_u8(u - 2 + 4 * stride, u0, 4); 377 vst4_lane_u8(u - 2 + 5 * stride, u0, 5); 378 vst4_lane_u8(u - 2 + 6 * stride, u0, 6); 379 vst4_lane_u8(u - 2 + 7 * stride, u0, 7); 380 vst4_lane_u8(v - 2 + 0 * stride, v0, 0); 381 vst4_lane_u8(v - 2 + 1 * stride, v0, 1); 382 vst4_lane_u8(v - 2 + 2 * stride, v0, 2); 383 vst4_lane_u8(v - 2 + 3 * stride, v0, 3); 384 vst4_lane_u8(v - 2 + 4 * stride, v0, 4); 385 vst4_lane_u8(v - 2 + 5 * stride, v0, 5); 386 vst4_lane_u8(v - 2 + 6 * stride, v0, 6); 387 vst4_lane_u8(v - 2 + 7 * stride, v0, 7); 388 } 389 390 #endif // !WORK_AROUND_GCC 391 392 // Zero extend 'v' to an int16x8_t. 393 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) { 394 return vreinterpretq_s16_u16(vmovl_u8(v)); 395 } 396 397 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result 398 // to the corresponding rows of 'dst'. 399 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, 400 const int16x8_t dst01, 401 const int16x8_t dst23) { 402 // Unsigned saturate to 8b. 403 const uint8x8_t dst01_u8 = vqmovun_s16(dst01); 404 const uint8x8_t dst23_u8 = vqmovun_s16(dst23); 405 406 // Store the results. 
407 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0); 408 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1); 409 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0); 410 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1); 411 } 412 413 static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, 414 uint8_t* const dst) { 415 uint32x2_t dst01 = vdup_n_u32(0); 416 uint32x2_t dst23 = vdup_n_u32(0); 417 418 // Load the source pixels. 419 dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0); 420 dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0); 421 dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1); 422 dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1); 423 424 { 425 // Convert to 16b. 426 const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01)); 427 const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23)); 428 429 // Descale with rounding. 430 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); 431 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); 432 // Add the inverse transform. 
433 SaturateAndStore4x4(dst, out01, out23); 434 } 435 } 436 437 //----------------------------------------------------------------------------- 438 // Simple In-loop filtering (Paragraph 15.2) 439 440 static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0, 441 const uint8x16_t q0, const uint8x16_t q1, 442 int thresh) { 443 const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh); 444 const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0) 445 const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1) 446 const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0) 447 const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2 448 const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2); 449 const uint8x16_t mask = vcgeq_u8(thresh_v, sum); 450 return mask; 451 } 452 453 static int8x16_t FlipSign(const uint8x16_t v) { 454 const uint8x16_t sign_bit = vdupq_n_u8(0x80); 455 return vreinterpretq_s8_u8(veorq_u8(v, sign_bit)); 456 } 457 458 static uint8x16_t FlipSignBack(const int8x16_t v) { 459 const int8x16_t sign_bit = vdupq_n_s8(0x80); 460 return vreinterpretq_u8_s8(veorq_s8(v, sign_bit)); 461 } 462 463 static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0, 464 const int8x16_t q0, const int8x16_t q1) { 465 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) 466 const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1) 467 const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0) 468 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0) 469 const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0) 470 return s3; 471 } 472 473 static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) { 474 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) 475 const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0) 476 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0) 477 return s2; 478 } 479 480 
//------------------------------------------------------------------------------ 481 482 static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s, 483 const int8x16_t delta, 484 int8x16_t* const op0, int8x16_t* const oq0) { 485 const int8x16_t kCst3 = vdupq_n_s8(0x03); 486 const int8x16_t kCst4 = vdupq_n_s8(0x04); 487 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); 488 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4); 489 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3); 490 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); 491 *op0 = vqaddq_s8(p0s, delta3); 492 *oq0 = vqsubq_s8(q0s, delta4); 493 } 494 495 #if defined(WEBP_USE_INTRINSICS) 496 497 static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s, 498 const int8x16_t delta, 499 uint8x16_t* const op0, uint8x16_t* const oq0) { 500 const int8x16_t kCst3 = vdupq_n_s8(0x03); 501 const int8x16_t kCst4 = vdupq_n_s8(0x04); 502 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); 503 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4); 504 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3); 505 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); 506 const int8x16_t sp0 = vqaddq_s8(p0s, delta3); 507 const int8x16_t sq0 = vqsubq_s8(q0s, delta4); 508 *op0 = FlipSignBack(sp0); 509 *oq0 = FlipSignBack(sq0); 510 } 511 512 static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, 513 const uint8x16_t q0, const uint8x16_t q1, 514 const uint8x16_t mask, 515 uint8x16_t* const op0, uint8x16_t* const oq0) { 516 const int8x16_t p1s = FlipSign(p1); 517 const int8x16_t p0s = FlipSign(p0); 518 const int8x16_t q0s = FlipSign(q0); 519 const int8x16_t q1s = FlipSign(q1); 520 const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); 521 const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask)); 522 ApplyFilter2(p0s, q0s, delta1, op0, oq0); 523 } 524 525 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { 526 uint8x16_t p1, p0, q0, q1, op0, oq0; 527 Load16x4(p, stride, &p1, &p0, &q0, 
&q1); 528 { 529 const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); 530 DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0); 531 } 532 Store16x2(op0, oq0, p, stride); 533 } 534 535 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { 536 uint8x16_t p1, p0, q0, q1, oq0, op0; 537 Load4x16(p, stride, &p1, &p0, &q0, &q1); 538 { 539 const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); 540 DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0); 541 } 542 Store2x16(op0, oq0, p, stride); 543 } 544 545 #else 546 547 #define QRegs "q0", "q1", "q2", "q3", \ 548 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 549 550 #define FLIP_SIGN_BIT2(a, b, s) \ 551 "veor " #a "," #a "," #s " \n" \ 552 "veor " #b "," #b "," #s " \n" \ 553 554 #define FLIP_SIGN_BIT4(a, b, c, d, s) \ 555 FLIP_SIGN_BIT2(a, b, s) \ 556 FLIP_SIGN_BIT2(c, d, s) \ 557 558 #define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \ 559 "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \ 560 "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \ 561 "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \ 562 "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \ 563 "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ 564 "vdup.8 q14, " #thresh " \n" \ 565 "vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */ 566 567 #define GET_BASE_DELTA(p1, p0, q0, q1, o) \ 568 "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \ 569 "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \ 570 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \ 571 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \ 572 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */ 573 574 #define DO_SIMPLE_FILTER(p0, q0, fl) \ 575 "vmov.i8 q15, #0x03 \n" \ 576 "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \ 577 "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \ 578 "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \ 579 \ 580 "vmov.i8 q15, #0x04 \n" \ 581 "vqadd.s8 q15, q15, " 
#fl " \n" /* filter1 = filter + 4 */ \ 582 "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \ 583 "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */ 584 585 // Applies filter on 2 pixels (p0 and q0) 586 #define DO_FILTER2(p1, p0, q0, q1, thresh) \ 587 NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \ 588 "vmov.i8 q10, #0x80 \n" /* sign bit */ \ 589 FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \ 590 GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \ 591 "vand q9, q9, q11 \n" /* apply filter mask */ \ 592 DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \ 593 FLIP_SIGN_BIT2(p0, q0, q10) 594 595 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { 596 __asm__ volatile ( 597 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride 598 599 "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1 600 "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0 601 "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0 602 "vld1.u8 {q12}, [%[p]] \n" // q1 603 604 DO_FILTER2(q1, q2, q3, q12, %[thresh]) 605 606 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride 607 608 "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0 609 "vst1.u8 {q3}, [%[p]] \n" // store oq0 610 : [p] "+r"(p) 611 : [stride] "r"(stride), [thresh] "r"(thresh) 612 : "memory", QRegs 613 ); 614 } 615 616 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { 617 __asm__ volatile ( 618 "sub r4, %[p], #2 \n" // base1 = p - 2 619 "lsl r6, %[stride], #1 \n" // r6 = 2 * stride 620 "add r5, r4, %[stride] \n" // base2 = base1 + stride 621 622 LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6) 623 LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6) 624 "vswp d3, d24 \n" // p1:q1 p0:q3 625 "vswp d5, d26 \n" // q0:q2 q1:q4 626 "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4 627 628 DO_FILTER2(q1, q2, q12, q13, %[thresh]) 629 630 "sub %[p], %[p], #1 \n" // p - 1 631 632 "vswp d5, d24 \n" 633 STORE8x2(d4, d5, [%[p]], %[stride]) 634 STORE8x2(d24, d25, [%[p]], %[stride]) 635 636 : [p] "+r"(p) 637 : [stride] 
"r"(stride), [thresh] "r"(thresh) 638 : "memory", "r4", "r5", "r6", QRegs 639 ); 640 } 641 642 #endif // WEBP_USE_INTRINSICS 643 644 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { 645 uint32_t k; 646 for (k = 3; k != 0; --k) { 647 p += 4 * stride; 648 SimpleVFilter16(p, stride, thresh); 649 } 650 } 651 652 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { 653 uint32_t k; 654 for (k = 3; k != 0; --k) { 655 p += 4; 656 SimpleHFilter16(p, stride, thresh); 657 } 658 } 659 660 //------------------------------------------------------------------------------ 661 // Complex In-loop filtering (Paragraph 15.3) 662 663 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, 664 const uint8x16_t q0, const uint8x16_t q1, 665 int hev_thresh) { 666 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh); 667 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) 668 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) 669 const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0); 670 const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v); 671 return mask; 672 } 673 674 static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, 675 const uint8x16_t p1, const uint8x16_t p0, 676 const uint8x16_t q0, const uint8x16_t q1, 677 const uint8x16_t q2, const uint8x16_t q3, 678 int ithresh, int thresh) { 679 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh); 680 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2) 681 const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1) 682 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) 683 const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2) 684 const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1) 685 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) 686 const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1); 687 const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2); 688 const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0); 
689 const uint8x16_t max12 = vmaxq_u8(max1, max2); 690 const uint8x16_t max123 = vmaxq_u8(max12, max3); 691 const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123); 692 const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh); 693 const uint8x16_t mask = vandq_u8(mask1, mask2); 694 return mask; 695 } 696 697 // 4-points filter 698 699 static void ApplyFilter4( 700 const int8x16_t p1, const int8x16_t p0, 701 const int8x16_t q0, const int8x16_t q1, 702 const int8x16_t delta0, 703 uint8x16_t* const op1, uint8x16_t* const op0, 704 uint8x16_t* const oq0, uint8x16_t* const oq1) { 705 const int8x16_t kCst3 = vdupq_n_s8(0x03); 706 const int8x16_t kCst4 = vdupq_n_s8(0x04); 707 const int8x16_t delta1 = vqaddq_s8(delta0, kCst4); 708 const int8x16_t delta2 = vqaddq_s8(delta0, kCst3); 709 const int8x16_t a1 = vshrq_n_s8(delta1, 3); 710 const int8x16_t a2 = vshrq_n_s8(delta2, 3); 711 const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1 712 *op0 = FlipSignBack(vqaddq_s8(p0, a2)); // clip(p0 + a2) 713 *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1) 714 *op1 = FlipSignBack(vqaddq_s8(p1, a3)); // clip(p1 + a3) 715 *oq1 = FlipSignBack(vqsubq_s8(q1, a3)); // clip(q1 - a3) 716 } 717 718 static void DoFilter4( 719 const uint8x16_t p1, const uint8x16_t p0, 720 const uint8x16_t q0, const uint8x16_t q1, 721 const uint8x16_t mask, const uint8x16_t hev_mask, 722 uint8x16_t* const op1, uint8x16_t* const op0, 723 uint8x16_t* const oq0, uint8x16_t* const oq1) { 724 // This is a fused version of DoFilter2() calling ApplyFilter2 directly 725 const int8x16_t p1s = FlipSign(p1); 726 int8x16_t p0s = FlipSign(p0); 727 int8x16_t q0s = FlipSign(q0); 728 const int8x16_t q1s = FlipSign(q1); 729 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); 730 731 // do_filter2 part (simple loopfilter on pixels with hev) 732 { 733 const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s); 734 const int8x16_t simple_lf_delta = 735 vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask)); 736 
// Simple-filter step: &p0s/&q0s are passed as the helper's outputs, so the
// p0s/q0s used below are presumably the already-adjusted values.
    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter4 part (complex loopfilter on pixels without hev)
  {
    const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
  }
}

// 6-points filter

// Applies the 6-tap adjustment to three pixels on each side of an edge.
//   p2, p1, p0, q0, q1, q2: sign-flipped (signed) pixel vectors, 16 lanes.
//   delta: per-lane base delta 'a'; callers pass it already masked, so lanes
//          that must stay untouched carry 0.
//   op2..oq2: outputs. Each is the saturating sum/difference of the input and
//          one of the three weights below, passed through FlipSignBack()
//          (defined earlier in this file).
// Weights, per the derivation below: a1 = (27*a+63)>>7 for p0/q0,
// a2 = (18*a+63)>>7 for p1/q1, a3 = (9*a+63)>>7 for p2/q2.
static void ApplyFilter6(
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    const int8x16_t delta,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
  // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
  // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
  //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
  // The products are widened to 16 bits (vmlal_s8), hence the lo/hi halves.
  const int8x8_t delta_lo = vget_low_s8(delta);
  const int8x8_t delta_hi = vget_high_s8(delta);
  const int8x8_t kCst9 = vdup_n_s8(9);
  const int16x8_t kCstm1 = vdupq_n_s16(-1);
  const int8x8_t kCst18 = vdup_n_s8(18);
  const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
  const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
  const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
  const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
  const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
  const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
  const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
  const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
  const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
  const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);

  *op0 = FlipSignBack(vqaddq_s8(p0, a1));  // clip(p0 + a1)
  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - a1)
  *oq1 = FlipSignBack(vqsubq_s8(q1, a2));  // clip(q1 - a2)
  *op1 = FlipSignBack(vqaddq_s8(p1, a2));  // clip(p1 + a2)
  *oq2 = FlipSignBack(vqsubq_s8(q2, a3));  // clip(q2 - a3)
  *op2 = FlipSignBack(vqaddq_s8(p2, a3));  // clip(p2 + a3)
}

// Filters six pixels across an edge, choosing per lane between two filters:
//   - lanes set in (mask & hev_mask) get the simple filter on p0/q0 only;
//   - lanes set in (mask & ~hev_mask) get the 6-tap filter (ApplyFilter6).
// Inputs are unsigned pixels; they are converted with FlipSign() first.
// Outputs op2..oq2 are all written by the final ApplyFilter6() call.
static void DoFilter6(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  const int8x16_t p2s = FlipSign(p2);
  const int8x16_t p1s = FlipSign(p1);
  int8x16_t p0s = FlipSign(p0);   // not const: updated by the simple filter
  int8x16_t q0s = FlipSign(q0);   // not const: updated by the simple filter
  const int8x16_t q1s = FlipSign(q1);
  const int8x16_t q2s = FlipSign(q2);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
  // The base delta is computed once, before any filtering, and masked twice.
  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t simple_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter6 part (complex loopfilter on pixels without hev)
  {
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    // In hev lanes the delta is 0 here, so this call only converts the
    // simple-filtered p0s/q0s (and the untouched outer pixels) back.
    ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
                 op2, op1, op0, oq0, oq1, oq2);
  }
}

// on macroblock edges

// 16-lane filter with row-wise load/store (Load16x8 / Store16x2).
//   p: pointer handed to Load16x8(); results are stored relative to it.
//   stride: distance in bytes between rows.
//   thresh, ithresh: forwarded to NeedsFilter2() to build the filter mask.
//   hev_thresh: forwarded to NeedsHev() to build the hev mask.
// p3 and q3 only feed the mask computation; six vectors are written back.
static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    // Each Store16x2() call writes one pair of vectors.
    Store16x2(op2, op1, p - 2 * stride, stride);
    Store16x2(op0, oq0, p + 0 * stride, stride);
    Store16x2(oq1, oq2, p + 2 * stride, stride);
  }
}

// Same as VFilter16(), but using the transposing Load8x16 / Store2x16
// helpers and byte (column) offsets p - 2, p, p + 2 for the stores.
static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store2x16(op2, op1, p - 2, stride);
    Store2x16(op0, oq0, p + 0, stride);
    Store2x16(oq1, oq2, p + 2, stride);
  }
}

// on three inner edges
// Runs the 4-tap filter (DoFilter4) three times, advancing p by 4 rows per
// iteration. Only four new vectors are loaded per edge: the other four are
// carried over from the previous iteration (see the comments in the loop).
static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4 * stride;
    Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
      // p3 and p2 are not just temporary variables here: they will be
      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
      // (DoFilter4's oq0/oq1 outputs land in p3/p2, i.e. the filtered q0/q1
      // of this edge become the p3/p2 of the next one.)
      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store16x4(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}

#if !defined(WORK_AROUND_GCC)
// Column-wise counterpart of VFilter16i(): same carry-over scheme, but p
// advances by 4 bytes per edge and the transposing Load4x16 / Store4x16
// helpers are used. Compiled out when WORK_AROUND_GCC is defined.
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4;
    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
      // Filtered q0/q1 are written into p3/p2 so they carry to the next edge.
      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store4x16(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}
#endif  // !WORK_AROUND_GCC

// 8-pixels wide variant, for chroma filtering
// The u and v planes are loaded together by Load8x8x2() into 16-lane vectors
// and filtered in a single pass with the 6-tap filter.
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
  }
}
// Chroma filter for the edge 4 rows below u/v, using the 4-tap filter.
// p3/p2/q2/q3 are loaded only for the mask computation.
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4 * stride;
  v += 4 * stride;
  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
  }
}

#if !defined(WORK_AROUND_GCC)
// Column-wise chroma filter: like VFilter8() but with the transposing
// Load8x8x2T() loader and a single Store6x8x2() for all six outputs.
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
  }
}

// Column-wise chroma filter for the edge 4 bytes to the right of u/v,
// using the 4-tap filter.
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4;
  v += 4;
  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
  }
}
#endif  // !WORK_AROUND_GCC

//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)

// Technically these are unsigned but vqdmulh is only available in signed.
976 // vqdmulh returns high half (effectively >> 16) but also doubles the value, 977 // changing the >> 16 to >> 15 and requiring an additional >> 1. 978 // We use this to our advantage with kC2. The canonical value is 35468. 979 // However, the high bit is set so treating it as signed will give incorrect 980 // results. We avoid this by down shifting by 1 here to clear the highest bit. 981 // Combined with the doubling effect of vqdmulh we get >> 16. 982 // This can not be applied to kC1 because the lowest bit is set. Down shifting 983 // the constant would reduce precision. 984 985 // libwebp uses a trick to avoid some extra addition that libvpx does. 986 // Instead of: 987 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); 988 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the 989 // same issue with kC1 and vqdmulh that we work around by down shifting kC2 990 991 static const int16_t kC1 = 20091; 992 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. 993 994 #if defined(WEBP_USE_INTRINSICS) 995 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, 996 int16x8x2_t* const out) { 997 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 998 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 999 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... 1000 // b0 d0 b1 d1 b2 d2 ... 
1001 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); 1002 } 1003 1004 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { 1005 // {rows} = in0 | in4 1006 // in8 | in12 1007 // B1 = in4 | in12 1008 const int16x8_t B1 = 1009 vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1])); 1010 // C0 = kC1 * in4 | kC1 * in12 1011 // C1 = kC2 * in4 | kC2 * in12 1012 const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1); 1013 const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2); 1014 const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]), 1015 vget_low_s16(rows->val[1])); // in0 + in8 1016 const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]), 1017 vget_low_s16(rows->val[1])); // in0 - in8 1018 // c = kC2 * in4 - kC1 * in12 1019 // d = kC1 * in4 + kC2 * in12 1020 const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0)); 1021 const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1)); 1022 const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b 1023 const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c 1024 const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c 1025 const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c 1026 const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp)); 1027 Transpose8x2(E0, E1, rows); 1028 } 1029 1030 static void TransformOne(const int16_t* in, uint8_t* dst) { 1031 int16x8x2_t rows; 1032 INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); 1033 TransformPass(&rows); 1034 TransformPass(&rows); 1035 Add4x4(rows.val[0], rows.val[1], dst); 1036 } 1037 1038 #else 1039 1040 static void TransformOne(const int16_t* in, uint8_t* dst) { 1041 const int kBPS = BPS; 1042 // kC1, kC2. 
Padded because vld1.16 loads 8 bytes 1043 const int16_t constants[4] = { kC1, kC2, 0, 0 }; 1044 /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */ 1045 __asm__ volatile ( 1046 "vld1.16 {q1, q2}, [%[in]] \n" 1047 "vld1.16 {d0}, [%[constants]] \n" 1048 1049 /* d2: in[0] 1050 * d3: in[8] 1051 * d4: in[4] 1052 * d5: in[12] 1053 */ 1054 "vswp d3, d4 \n" 1055 1056 /* q8 = {in[4], in[12]} * kC1 * 2 >> 16 1057 * q9 = {in[4], in[12]} * kC2 >> 16 1058 */ 1059 "vqdmulh.s16 q8, q2, d0[0] \n" 1060 "vqdmulh.s16 q9, q2, d0[1] \n" 1061 1062 /* d22 = a = in[0] + in[8] 1063 * d23 = b = in[0] - in[8] 1064 */ 1065 "vqadd.s16 d22, d2, d3 \n" 1066 "vqsub.s16 d23, d2, d3 \n" 1067 1068 /* The multiplication should be x * kC1 >> 16 1069 * However, with vqdmulh we get x * kC1 * 2 >> 16 1070 * (multiply, double, return high half) 1071 * We avoided this in kC2 by pre-shifting the constant. 1072 * q8 = in[4]/[12] * kC1 >> 16 1073 */ 1074 "vshr.s16 q8, q8, #1 \n" 1075 1076 /* Add {in[4], in[12]} back after the multiplication. This is handled by 1077 * adding 1 << 16 to kC1 in the libwebp C code. 
1078 */ 1079 "vqadd.s16 q8, q2, q8 \n" 1080 1081 /* d20 = c = in[4]*kC2 - in[12]*kC1 1082 * d21 = d = in[4]*kC1 + in[12]*kC2 1083 */ 1084 "vqsub.s16 d20, d18, d17 \n" 1085 "vqadd.s16 d21, d19, d16 \n" 1086 1087 /* d2 = tmp[0] = a + d 1088 * d3 = tmp[1] = b + c 1089 * d4 = tmp[2] = b - c 1090 * d5 = tmp[3] = a - d 1091 */ 1092 "vqadd.s16 d2, d22, d21 \n" 1093 "vqadd.s16 d3, d23, d20 \n" 1094 "vqsub.s16 d4, d23, d20 \n" 1095 "vqsub.s16 d5, d22, d21 \n" 1096 1097 "vzip.16 q1, q2 \n" 1098 "vzip.16 q1, q2 \n" 1099 1100 "vswp d3, d4 \n" 1101 1102 /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 1103 * q9 = {tmp[4], tmp[12]} * kC2 >> 16 1104 */ 1105 "vqdmulh.s16 q8, q2, d0[0] \n" 1106 "vqdmulh.s16 q9, q2, d0[1] \n" 1107 1108 /* d22 = a = tmp[0] + tmp[8] 1109 * d23 = b = tmp[0] - tmp[8] 1110 */ 1111 "vqadd.s16 d22, d2, d3 \n" 1112 "vqsub.s16 d23, d2, d3 \n" 1113 1114 /* See long winded explanations prior */ 1115 "vshr.s16 q8, q8, #1 \n" 1116 "vqadd.s16 q8, q2, q8 \n" 1117 1118 /* d20 = c = in[4]*kC2 - in[12]*kC1 1119 * d21 = d = in[4]*kC1 + in[12]*kC2 1120 */ 1121 "vqsub.s16 d20, d18, d17 \n" 1122 "vqadd.s16 d21, d19, d16 \n" 1123 1124 /* d2 = tmp[0] = a + d 1125 * d3 = tmp[1] = b + c 1126 * d4 = tmp[2] = b - c 1127 * d5 = tmp[3] = a - d 1128 */ 1129 "vqadd.s16 d2, d22, d21 \n" 1130 "vqadd.s16 d3, d23, d20 \n" 1131 "vqsub.s16 d4, d23, d20 \n" 1132 "vqsub.s16 d5, d22, d21 \n" 1133 1134 "vld1.32 d6[0], [%[dst]], %[kBPS] \n" 1135 "vld1.32 d6[1], [%[dst]], %[kBPS] \n" 1136 "vld1.32 d7[0], [%[dst]], %[kBPS] \n" 1137 "vld1.32 d7[1], [%[dst]], %[kBPS] \n" 1138 1139 "sub %[dst], %[dst], %[kBPS], lsl #2 \n" 1140 1141 /* (val) + 4 >> 3 */ 1142 "vrshr.s16 d2, d2, #3 \n" 1143 "vrshr.s16 d3, d3, #3 \n" 1144 "vrshr.s16 d4, d4, #3 \n" 1145 "vrshr.s16 d5, d5, #3 \n" 1146 1147 "vzip.16 q1, q2 \n" 1148 "vzip.16 q1, q2 \n" 1149 1150 /* Must accumulate before saturating */ 1151 "vmovl.u8 q8, d6 \n" 1152 "vmovl.u8 q9, d7 \n" 1153 1154 "vqadd.s16 q1, q1, q8 \n" 1155 "vqadd.s16 q2, q2, q9 \n" 1156 1157 
"vqmovun.s16 d0, q1 \n" 1158 "vqmovun.s16 d1, q2 \n" 1159 1160 "vst1.32 d0[0], [%[dst]], %[kBPS] \n" 1161 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 1162 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 1163 "vst1.32 d1[1], [%[dst]] \n" 1164 1165 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ 1166 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */ 1167 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */ 1168 ); 1169 } 1170 1171 #endif // WEBP_USE_INTRINSICS 1172 1173 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { 1174 TransformOne(in, dst); 1175 if (do_two) { 1176 TransformOne(in + 16, dst + 4); 1177 } 1178 } 1179 1180 static void TransformDC(const int16_t* in, uint8_t* dst) { 1181 const int16x8_t DC = vdupq_n_s16(in[0]); 1182 Add4x4(DC, DC, dst); 1183 } 1184 1185 //------------------------------------------------------------------------------ 1186 1187 #define STORE_WHT(dst, col, rows) do { \ 1188 *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \ 1189 *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \ 1190 *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \ 1191 *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \ 1192 } while (0) 1193 1194 static void TransformWHT(const int16_t* in, int16_t* out) { 1195 int32x4x4_t tmp; 1196 1197 { 1198 // Load the source. 
1199 const int16x4_t in00_03 = vld1_s16(in + 0); 1200 const int16x4_t in04_07 = vld1_s16(in + 4); 1201 const int16x4_t in08_11 = vld1_s16(in + 8); 1202 const int16x4_t in12_15 = vld1_s16(in + 12); 1203 const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15] 1204 const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11] 1205 const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11] 1206 const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15] 1207 tmp.val[0] = vaddq_s32(a0, a1); 1208 tmp.val[1] = vaddq_s32(a3, a2); 1209 tmp.val[2] = vsubq_s32(a0, a1); 1210 tmp.val[3] = vsubq_s32(a3, a2); 1211 // Arrange the temporary results column-wise. 1212 tmp = Transpose4x4(tmp); 1213 } 1214 1215 { 1216 const int32x4_t kCst3 = vdupq_n_s32(3); 1217 const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder 1218 const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]); 1219 const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]); 1220 const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]); 1221 const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]); 1222 1223 tmp.val[0] = vaddq_s32(a0, a1); 1224 tmp.val[1] = vaddq_s32(a3, a2); 1225 tmp.val[2] = vsubq_s32(a0, a1); 1226 tmp.val[3] = vsubq_s32(a3, a2); 1227 1228 // right shift the results by 3. 
1229 tmp.val[0] = vshrq_n_s32(tmp.val[0], 3); 1230 tmp.val[1] = vshrq_n_s32(tmp.val[1], 3); 1231 tmp.val[2] = vshrq_n_s32(tmp.val[2], 3); 1232 tmp.val[3] = vshrq_n_s32(tmp.val[3], 3); 1233 1234 STORE_WHT(out, 0, tmp); 1235 STORE_WHT(out, 1, tmp); 1236 STORE_WHT(out, 2, tmp); 1237 STORE_WHT(out, 3, tmp); 1238 } 1239 } 1240 1241 #undef STORE_WHT 1242 1243 //------------------------------------------------------------------------------ 1244 1245 #define MUL(a, b) (((a) * (b)) >> 16) 1246 static void TransformAC3(const int16_t* in, uint8_t* dst) { 1247 static const int kC1_full = 20091 + (1 << 16); 1248 static const int kC2_full = 35468; 1249 const int16x4_t A = vld1_dup_s16(in); 1250 const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full)); 1251 const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full)); 1252 const int c1 = MUL(in[1], kC2_full); 1253 const int d1 = MUL(in[1], kC1_full); 1254 const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 | 1255 (uint64_t)( c1 & 0xffff) << 16 | 1256 (uint64_t)(-c1 & 0xffff) << 32 | 1257 (uint64_t)(-d1 & 0xffff) << 48; 1258 const int16x4_t CD = vcreate_s16(cd); 1259 const int16x4_t B = vqadd_s16(A, CD); 1260 const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4)); 1261 const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4)); 1262 Add4x4(m0_m1, m2_m3, dst); 1263 } 1264 #undef MUL 1265 1266 //------------------------------------------------------------------------------ 1267 // 4x4 1268 1269 static void DC4(uint8_t* dst) { // DC 1270 const uint8x8_t A = vld1_u8(dst - BPS); // top row 1271 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top 1272 const uint16x4_t p1 = vpadd_u16(p0, p0); 1273 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); 1274 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); 1275 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); 1276 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); 1277 const uint16x8_t s0 = vaddq_u16(L0, L1); 1278 
const uint16x8_t s1 = vaddq_u16(L2, L3); 1279 const uint16x8_t s01 = vaddq_u16(s0, s1); 1280 const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1)); 1281 const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3 1282 const uint8x8_t dc = vdup_lane_u8(dc0, 0); 1283 int i; 1284 for (i = 0; i < 4; ++i) { 1285 vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0); 1286 } 1287 } 1288 1289 // TrueMotion (4x4 + 8x8) 1290 static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { 1291 const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]' 1292 const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]' 1293 const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1] 1294 int y; 1295 for (y = 0; y < size; y += 4) { 1296 // left edge 1297 const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1)); 1298 const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1)); 1299 const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1)); 1300 const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1)); 1301 const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1] 1302 const int16x8_t r1 = vaddq_s16(L1, d); 1303 const int16x8_t r2 = vaddq_s16(L2, d); 1304 const int16x8_t r3 = vaddq_s16(L3, d); 1305 // Saturate and store the result. 
1306 const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0)); 1307 const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1)); 1308 const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2)); 1309 const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3)); 1310 if (size == 4) { 1311 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0); 1312 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0); 1313 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0); 1314 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0); 1315 } else { 1316 vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32); 1317 vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32); 1318 vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32); 1319 vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32); 1320 } 1321 dst += 4 * BPS; 1322 } 1323 } 1324 1325 static void TM4(uint8_t* dst) { TrueMotion(dst, 4); } 1326 1327 static void VE4(uint8_t* dst) { // vertical 1328 // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS. 1329 const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row 1330 const uint64x1_t A1 = vshr_n_u64(A0, 8); 1331 const uint64x1_t A2 = vshr_n_u64(A0, 16); 1332 const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); 1333 const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); 1334 const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); 1335 const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00); 1336 const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0); 1337 int i; 1338 for (i = 0; i < 4; ++i) { 1339 vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0); 1340 } 1341 } 1342 1343 static void RD4(uint8_t* dst) { // Down-right 1344 const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1); 1345 const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); 1346 const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); 1347 const uint32_t I = dst[-1 + 0 * BPS]; 1348 const uint32_t J = dst[-1 + 1 * BPS]; 1349 const uint32_t K = dst[-1 + 2 * BPS]; 1350 const uint32_t L = dst[-1 + 3 * BPS]; 1351 const uint64x1_t 
LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24)); 1352 const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); 1353 const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); 1354 const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); 1355 const uint8_t D = vget_lane_u8(XABCD_u8, 4); 1356 const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); 1357 const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); 1358 const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); 1359 const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); 1360 const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); 1361 const uint32x2_t r3 = vreinterpret_u32_u8(avg2); 1362 const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); 1363 const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); 1364 const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); 1365 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0); 1366 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0); 1367 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0); 1368 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0); 1369 } 1370 1371 static void LD4(uint8_t* dst) { // Down-left 1372 // Note using the same shift trick as VE4() is slower here. 
1373 const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0); 1374 const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1); 1375 const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2); 1376 const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6); 1377 const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0); 1378 const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); 1379 const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); 1380 const uint32x2_t r0 = vreinterpret_u32_u8(avg2); 1381 const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); 1382 const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); 1383 const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); 1384 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0); 1385 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0); 1386 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0); 1387 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0); 1388 } 1389 1390 //------------------------------------------------------------------------------ 1391 // Chroma 1392 1393 static void VE8uv(uint8_t* dst) { // vertical 1394 const uint8x8_t top = vld1_u8(dst - BPS); 1395 int j; 1396 for (j = 0; j < 8; ++j) { 1397 vst1_u8(dst + j * BPS, top); 1398 } 1399 } 1400 1401 static void HE8uv(uint8_t* dst) { // horizontal 1402 int j; 1403 for (j = 0; j < 8; ++j) { 1404 const uint8x8_t left = vld1_dup_u8(dst - 1); 1405 vst1_u8(dst, left); 1406 dst += BPS; 1407 } 1408 } 1409 1410 static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) { 1411 uint16x8_t sum_top; 1412 uint16x8_t sum_left; 1413 uint8x8_t dc0; 1414 1415 if (do_top) { 1416 const uint8x8_t A = vld1_u8(dst - BPS); // top row 1417 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top 1418 const uint16x4_t p1 = vpadd_u16(p0, p0); 1419 const uint16x4_t p2 = vpadd_u16(p1, p1); 1420 sum_top = vcombine_u16(p2, p2); 1421 } 1422 1423 if (do_left) { 1424 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); 1425 const uint16x8_t L1 = 
vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); 1426 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); 1427 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); 1428 const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1)); 1429 const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1)); 1430 const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1)); 1431 const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1)); 1432 const uint16x8_t s0 = vaddq_u16(L0, L1); 1433 const uint16x8_t s1 = vaddq_u16(L2, L3); 1434 const uint16x8_t s2 = vaddq_u16(L4, L5); 1435 const uint16x8_t s3 = vaddq_u16(L6, L7); 1436 const uint16x8_t s01 = vaddq_u16(s0, s1); 1437 const uint16x8_t s23 = vaddq_u16(s2, s3); 1438 sum_left = vaddq_u16(s01, s23); 1439 } 1440 1441 if (do_top && do_left) { 1442 const uint16x8_t sum = vaddq_u16(sum_left, sum_top); 1443 dc0 = vrshrn_n_u16(sum, 4); 1444 } else if (do_top) { 1445 dc0 = vrshrn_n_u16(sum_top, 3); 1446 } else if (do_left) { 1447 dc0 = vrshrn_n_u16(sum_left, 3); 1448 } else { 1449 dc0 = vdup_n_u8(0x80); 1450 } 1451 1452 { 1453 const uint8x8_t dc = vdup_lane_u8(dc0, 0); 1454 int i; 1455 for (i = 0; i < 8; ++i) { 1456 vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc)); 1457 } 1458 } 1459 } 1460 1461 static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); } 1462 static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); } 1463 static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); } 1464 static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); } 1465 1466 static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); } 1467 1468 //------------------------------------------------------------------------------ 1469 // 16x16 1470 1471 static void VE16(uint8_t* dst) { // vertical 1472 const uint8x16_t top = vld1q_u8(dst - BPS); 1473 int j; 1474 for (j = 0; j < 16; ++j) { 1475 vst1q_u8(dst + j * BPS, top); 1476 } 1477 } 1478 1479 static void HE16(uint8_t* dst) { // horizontal 1480 int j; 1481 for (j = 0; j < 16; ++j) { 1482 const uint8x16_t left = 
vld1q_dup_u8(dst - 1); 1483 vst1q_u8(dst, left); 1484 dst += BPS; 1485 } 1486 } 1487 1488 static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) { 1489 uint16x8_t sum_top; 1490 uint16x8_t sum_left; 1491 uint8x8_t dc0; 1492 1493 if (do_top) { 1494 const uint8x16_t A = vld1q_u8(dst - BPS); // top row 1495 const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top 1496 const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); 1497 const uint16x4_t p2 = vpadd_u16(p1, p1); 1498 const uint16x4_t p3 = vpadd_u16(p2, p2); 1499 sum_top = vcombine_u16(p3, p3); 1500 } 1501 1502 if (do_left) { 1503 int i; 1504 sum_left = vdupq_n_u16(0); 1505 for (i = 0; i < 16; i += 8) { 1506 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1)); 1507 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1)); 1508 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1)); 1509 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1)); 1510 const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1)); 1511 const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1)); 1512 const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1)); 1513 const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1)); 1514 const uint16x8_t s0 = vaddq_u16(L0, L1); 1515 const uint16x8_t s1 = vaddq_u16(L2, L3); 1516 const uint16x8_t s2 = vaddq_u16(L4, L5); 1517 const uint16x8_t s3 = vaddq_u16(L6, L7); 1518 const uint16x8_t s01 = vaddq_u16(s0, s1); 1519 const uint16x8_t s23 = vaddq_u16(s2, s3); 1520 const uint16x8_t sum = vaddq_u16(s01, s23); 1521 sum_left = vaddq_u16(sum_left, sum); 1522 } 1523 } 1524 1525 if (do_top && do_left) { 1526 const uint16x8_t sum = vaddq_u16(sum_left, sum_top); 1527 dc0 = vrshrn_n_u16(sum, 5); 1528 } else if (do_top) { 1529 dc0 = vrshrn_n_u16(sum_top, 4); 1530 } else if (do_left) { 1531 dc0 = vrshrn_n_u16(sum_left, 4); 1532 } else { 1533 dc0 = vdup_n_u8(0x80); 1534 } 1535 1536 { 1537 const uint8x16_t dc = 
vdupq_lane_u8(dc0, 0); 1538 int i; 1539 for (i = 0; i < 16; ++i) { 1540 vst1q_u8(dst + i * BPS, dc); 1541 } 1542 } 1543 } 1544 1545 static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); } 1546 static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); } 1547 static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); } 1548 static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); } 1549 1550 static void TM16(uint8_t* dst) { 1551 const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]' 1552 const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]' 1553 // A[c] - A[-1] 1554 const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL)); 1555 const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL)); 1556 int y; 1557 for (y = 0; y < 16; y += 4) { 1558 // left edge 1559 const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1)); 1560 const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1)); 1561 const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1)); 1562 const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1)); 1563 const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1] 1564 const int16x8_t r1_lo = vaddq_s16(L1, d_lo); 1565 const int16x8_t r2_lo = vaddq_s16(L2, d_lo); 1566 const int16x8_t r3_lo = vaddq_s16(L3, d_lo); 1567 const int16x8_t r0_hi = vaddq_s16(L0, d_hi); 1568 const int16x8_t r1_hi = vaddq_s16(L1, d_hi); 1569 const int16x8_t r2_hi = vaddq_s16(L2, d_hi); 1570 const int16x8_t r3_hi = vaddq_s16(L3, d_hi); 1571 // Saturate and store the result. 
1572 const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi)); 1573 const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi)); 1574 const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi)); 1575 const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi)); 1576 vst1q_u8(dst + 0 * BPS, row0); 1577 vst1q_u8(dst + 1 * BPS, row1); 1578 vst1q_u8(dst + 2 * BPS, row2); 1579 vst1q_u8(dst + 3 * BPS, row3); 1580 dst += 4 * BPS; 1581 } 1582 } 1583 1584 //------------------------------------------------------------------------------ 1585 // Entry point 1586 1587 extern void VP8DspInitNEON(void); 1588 1589 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) { 1590 VP8Transform = TransformTwo; 1591 VP8TransformAC3 = TransformAC3; 1592 VP8TransformDC = TransformDC; 1593 VP8TransformWHT = TransformWHT; 1594 1595 VP8VFilter16 = VFilter16; 1596 VP8VFilter16i = VFilter16i; 1597 VP8HFilter16 = HFilter16; 1598 #if !defined(WORK_AROUND_GCC) 1599 VP8HFilter16i = HFilter16i; 1600 #endif 1601 VP8VFilter8 = VFilter8; 1602 VP8VFilter8i = VFilter8i; 1603 #if !defined(WORK_AROUND_GCC) 1604 VP8HFilter8 = HFilter8; 1605 VP8HFilter8i = HFilter8i; 1606 #endif 1607 VP8SimpleVFilter16 = SimpleVFilter16; 1608 VP8SimpleHFilter16 = SimpleHFilter16; 1609 VP8SimpleVFilter16i = SimpleVFilter16i; 1610 VP8SimpleHFilter16i = SimpleHFilter16i; 1611 1612 VP8PredLuma4[0] = DC4; 1613 VP8PredLuma4[1] = TM4; 1614 VP8PredLuma4[2] = VE4; 1615 VP8PredLuma4[4] = RD4; 1616 VP8PredLuma4[6] = LD4; 1617 1618 VP8PredLuma16[0] = DC16TopLeft; 1619 VP8PredLuma16[1] = TM16; 1620 VP8PredLuma16[2] = VE16; 1621 VP8PredLuma16[3] = HE16; 1622 VP8PredLuma16[4] = DC16NoTop; 1623 VP8PredLuma16[5] = DC16NoLeft; 1624 VP8PredLuma16[6] = DC16NoTopLeft; 1625 1626 VP8PredChroma8[0] = DC8uv; 1627 VP8PredChroma8[1] = TM8uv; 1628 VP8PredChroma8[2] = VE8uv; 1629 VP8PredChroma8[3] = HE8uv; 1630 VP8PredChroma8[4] = DC8uvNoTop; 1631 VP8PredChroma8[5] = DC8uvNoLeft; 1632 
VP8PredChroma8[6] = DC8uvNoTopLeft; 1633 } 1634 1635 #else // !WEBP_USE_NEON 1636 1637 WEBP_DSP_INIT_STUB(VP8DspInitNEON) 1638 1639 #endif // WEBP_USE_NEON 1640