1 // Copyright 2012 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // ARM NEON version of speed-critical encoding functions. 11 // 12 // adapted from libvpx (http://www.webmproject.org/code/) 13 14 #include "./dsp.h" 15 16 #if defined(WEBP_USE_NEON) 17 18 #include <assert.h> 19 20 #include "./neon.h" 21 #include "../enc/vp8i_enc.h" 22 23 //------------------------------------------------------------------------------ 24 // Transforms (Paragraph 14.4) 25 26 // Inverse transform. 27 // This code is pretty much the same as TransformOne in the dec_neon.c, except 28 // for subtraction to *ref. See the comments there for algorithmic explanations. 29 30 static const int16_t kC1 = 20091; 31 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. 32 33 // This code works but is *slower* than the inlined-asm version below 34 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to 35 // WEBP_USE_INTRINSICS define. 36 // With gcc-4.8, it's a little faster speed than inlined-assembly. 37 #if defined(WEBP_USE_INTRINSICS) 38 39 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t. 40 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { 41 return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); 42 } 43 44 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result 45 // to the corresponding rows of 'dst'. 46 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, 47 const int16x8_t dst01, 48 const int16x8_t dst23) { 49 // Unsigned saturate to 8b. 
50 const uint8x8_t dst01_u8 = vqmovun_s16(dst01); 51 const uint8x8_t dst23_u8 = vqmovun_s16(dst23); 52 53 // Store the results. 54 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0); 55 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1); 56 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0); 57 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1); 58 } 59 60 static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, 61 const uint8_t* const ref, uint8_t* const dst) { 62 uint32x2_t dst01 = vdup_n_u32(0); 63 uint32x2_t dst23 = vdup_n_u32(0); 64 65 // Load the source pixels. 66 dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0); 67 dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0); 68 dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1); 69 dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1); 70 71 { 72 // Convert to 16b. 73 const int16x8_t dst01_s16 = ConvertU8ToS16(dst01); 74 const int16x8_t dst23_s16 = ConvertU8ToS16(dst23); 75 76 // Descale with rounding. 77 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); 78 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); 79 // Add the inverse transform. 80 SaturateAndStore4x4(dst, out01, out23); 81 } 82 } 83 84 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, 85 int16x8x2_t* const out) { 86 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 87 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 88 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... 89 // b0 d0 b1 d1 b2 d2 ... 
90 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); 91 } 92 93 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { 94 // {rows} = in0 | in4 95 // in8 | in12 96 // B1 = in4 | in12 97 const int16x8_t B1 = 98 vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1])); 99 // C0 = kC1 * in4 | kC1 * in12 100 // C1 = kC2 * in4 | kC2 * in12 101 const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1); 102 const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2); 103 const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]), 104 vget_low_s16(rows->val[1])); // in0 + in8 105 const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]), 106 vget_low_s16(rows->val[1])); // in0 - in8 107 // c = kC2 * in4 - kC1 * in12 108 // d = kC1 * in4 + kC2 * in12 109 const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0)); 110 const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1)); 111 const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b 112 const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c 113 const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c 114 const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c 115 const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp)); 116 Transpose8x2(E0, E1, rows); 117 } 118 119 static void ITransformOne(const uint8_t* ref, 120 const int16_t* in, uint8_t* dst) { 121 int16x8x2_t rows; 122 INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); 123 TransformPass(&rows); 124 TransformPass(&rows); 125 Add4x4(rows.val[0], rows.val[1], ref, dst); 126 } 127 128 #else 129 130 static void ITransformOne(const uint8_t* ref, 131 const int16_t* in, uint8_t* dst) { 132 const int kBPS = BPS; 133 const int16_t kC1C2[] = { kC1, kC2, 0, 0 }; 134 135 __asm__ volatile ( 136 "vld1.16 {q1, q2}, [%[in]] \n" 137 "vld1.16 {d0}, [%[kC1C2]] \n" 138 139 // d2: in[0] 140 // d3: in[8] 141 // d4: in[4] 142 // d5: in[12] 143 "vswp d3, d4 \n" 144 145 // q8 = {in[4], in[12]} * kC1 * 2 >> 16 146 // q9 = {in[4], in[12]} * kC2 >> 
16 147 "vqdmulh.s16 q8, q2, d0[0] \n" 148 "vqdmulh.s16 q9, q2, d0[1] \n" 149 150 // d22 = a = in[0] + in[8] 151 // d23 = b = in[0] - in[8] 152 "vqadd.s16 d22, d2, d3 \n" 153 "vqsub.s16 d23, d2, d3 \n" 154 155 // q8 = in[4]/[12] * kC1 >> 16 156 "vshr.s16 q8, q8, #1 \n" 157 158 // Add {in[4], in[12]} back after the multiplication. 159 "vqadd.s16 q8, q2, q8 \n" 160 161 // d20 = c = in[4]*kC2 - in[12]*kC1 162 // d21 = d = in[4]*kC1 + in[12]*kC2 163 "vqsub.s16 d20, d18, d17 \n" 164 "vqadd.s16 d21, d19, d16 \n" 165 166 // d2 = tmp[0] = a + d 167 // d3 = tmp[1] = b + c 168 // d4 = tmp[2] = b - c 169 // d5 = tmp[3] = a - d 170 "vqadd.s16 d2, d22, d21 \n" 171 "vqadd.s16 d3, d23, d20 \n" 172 "vqsub.s16 d4, d23, d20 \n" 173 "vqsub.s16 d5, d22, d21 \n" 174 175 "vzip.16 q1, q2 \n" 176 "vzip.16 q1, q2 \n" 177 178 "vswp d3, d4 \n" 179 180 // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 181 // q9 = {tmp[4], tmp[12]} * kC2 >> 16 182 "vqdmulh.s16 q8, q2, d0[0] \n" 183 "vqdmulh.s16 q9, q2, d0[1] \n" 184 185 // d22 = a = tmp[0] + tmp[8] 186 // d23 = b = tmp[0] - tmp[8] 187 "vqadd.s16 d22, d2, d3 \n" 188 "vqsub.s16 d23, d2, d3 \n" 189 190 "vshr.s16 q8, q8, #1 \n" 191 "vqadd.s16 q8, q2, q8 \n" 192 193 // d20 = c = in[4]*kC2 - in[12]*kC1 194 // d21 = d = in[4]*kC1 + in[12]*kC2 195 "vqsub.s16 d20, d18, d17 \n" 196 "vqadd.s16 d21, d19, d16 \n" 197 198 // d2 = tmp[0] = a + d 199 // d3 = tmp[1] = b + c 200 // d4 = tmp[2] = b - c 201 // d5 = tmp[3] = a - d 202 "vqadd.s16 d2, d22, d21 \n" 203 "vqadd.s16 d3, d23, d20 \n" 204 "vqsub.s16 d4, d23, d20 \n" 205 "vqsub.s16 d5, d22, d21 \n" 206 207 "vld1.32 d6[0], [%[ref]], %[kBPS] \n" 208 "vld1.32 d6[1], [%[ref]], %[kBPS] \n" 209 "vld1.32 d7[0], [%[ref]], %[kBPS] \n" 210 "vld1.32 d7[1], [%[ref]], %[kBPS] \n" 211 212 "sub %[ref], %[ref], %[kBPS], lsl #2 \n" 213 214 // (val) + 4 >> 3 215 "vrshr.s16 d2, d2, #3 \n" 216 "vrshr.s16 d3, d3, #3 \n" 217 "vrshr.s16 d4, d4, #3 \n" 218 "vrshr.s16 d5, d5, #3 \n" 219 220 "vzip.16 q1, q2 \n" 221 "vzip.16 q1, q2 \n" 222 
223 // Must accumulate before saturating 224 "vmovl.u8 q8, d6 \n" 225 "vmovl.u8 q9, d7 \n" 226 227 "vqadd.s16 q1, q1, q8 \n" 228 "vqadd.s16 q2, q2, q9 \n" 229 230 "vqmovun.s16 d0, q1 \n" 231 "vqmovun.s16 d1, q2 \n" 232 233 "vst1.32 d0[0], [%[dst]], %[kBPS] \n" 234 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 235 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 236 "vst1.32 d1[1], [%[dst]] \n" 237 238 : [in] "+r"(in), [dst] "+r"(dst) // modified registers 239 : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants 240 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered 241 ); 242 } 243 244 #endif // WEBP_USE_INTRINSICS 245 246 static void ITransform(const uint8_t* ref, 247 const int16_t* in, uint8_t* dst, int do_two) { 248 ITransformOne(ref, in, dst); 249 if (do_two) { 250 ITransformOne(ref + 4, in + 16, dst + 4); 251 } 252 } 253 254 // Load all 4x4 pixels into a single uint8x16_t variable. 255 static uint8x16_t Load4x4(const uint8_t* src) { 256 uint32x4_t out = vdupq_n_u32(0); 257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); 258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); 259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); 260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); 261 return vreinterpretq_u8_u32(out); 262 } 263 264 // Forward transform. 

#if defined(WEBP_USE_INTRINSICS)

// Transposes the 4x4 int16 matrix whose rows are A, B, C and D.
// On return '*out01' holds rows 0 and 1 of the transposed matrix (low | high
// half) and '*out32' holds rows 3 and 2, in that order.
static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
                                         const int16x4_t C, const int16x4_t D,
                                         int16x8_t* const out01,
                                         int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
                                     vreinterpret_s32_s16(CD.val[1]));
  *out01 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
                   vreinterpret_s64_s32(tmp13.val[0])));
  *out32 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
                   vreinterpret_s64_s32(tmp02.val[1])));
}

// Returns the widened difference 'a - b', reinterpreted as signed 16b values.
static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
                                         const uint8x8_t b) {
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
}

// Forward transform of the 4x4 difference 'src - ref' (rows are BPS bytes
// apart). The 16 resulting coefficients are written to 'out'.
static void FTransform(const uint8_t* src, const uint8_t* ref,
                       int16_t* out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4(src);
    const uint8x16_t R0 = Load4x4(ref);
    const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {    // 1st pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {    // 2nd pass
    // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    // vaddhn keeps the high 16 bits of each 32b sum, i.e. '>> 16'.
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    // vceq yields all-ones (-1 as int16) where a3 == 0: adding it cancels the
    // extra '+1' brought in by the (1 << 16) term of kCst12000.
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}

#else

// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
// Multipliers, replicated over four 16b lanes each.
static const int16_t kCoeff16[] = {
  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
};
// Rounding constants, replicated over four 32b lanes each.
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};

// Inline-assembly variant of FTransform: forward transform of the 4x4
// difference 'src - ref' (rows are kBPS bytes apart). The 16 resulting
// coefficients are written to 'out'.
static void FTransform(const uint8_t* src, const uint8_t* ref,
                       int16_t* out) {
  const int kBPS = BPS;
  // Local copies of the pointers, since the asm block advances them.
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
    "vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
    "vld1.8 {d11}, [%[src_ptr]] \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
    "vld1.8 {d15}, [%[ref_ptr]] \n"

    // Pack the high values in to q4 and q6
    "vtrn.32 q4, q5 \n"
    "vtrn.32 q6, q7 \n"

    // d[0-3] = src - ref
    "vsubl.u8 q0, d8, d12 \n"
    "vsubl.u8 q1, d9, d13 \n"

    // load coeff16 into q8(d16=5352, d17=2217)
    "vld1.16 {q8}, [%[coeff16]] \n"

    // load coeff32 high half into q9 = 1812, q10 = 937
    "vld1.32 {q9, q10}, [%[coeff32]]! \n"

    // load coeff32 low half into q11=12000, q12=51000
    "vld1.32 {q11,q12}, [%[coeff32]] \n"

    // part 1
    // Transpose. Register dN is the same as dN in C
    "vtrn.32 d0, d2 \n"
    "vtrn.32 d1, d3 \n"
    "vtrn.16 d0, d1 \n"
    "vtrn.16 d2, d3 \n"

    "vadd.s16 d4, d0, d3 \n"  // a0 = d0 + d3
    "vadd.s16 d5, d1, d2 \n"  // a1 = d1 + d2
    "vsub.s16 d6, d1, d2 \n"  // a2 = d1 - d2
    "vsub.s16 d7, d0, d3 \n"  // a3 = d0 - d3

    "vadd.s16 d0, d4, d5 \n"  // a0 + a1
    "vshl.s16 d0, d0, #3 \n"  // temp[0+i*4] = (a0+a1) << 3
    "vsub.s16 d2, d4, d5 \n"  // a0 - a1
    "vshl.s16 d2, d2, #3 \n"  // temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16 q9, d7, d16 \n"  // a3*5352 + 1812
    "vmlal.s16 q10, d7, d17 \n"  // a3*2217 + 937
    "vmlal.s16 q9, d6, d17 \n"  // a2*2217 + a3*5352 + 1812
    "vmlsl.s16 q10, d6, d16 \n"  // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
    "vshrn.s32 d1, q9, #9 \n"
    "vshrn.s32 d3, q10, #9 \n"

    // part 2
    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    "vtrn.32 d0, d2 \n"
    "vtrn.32 d1, d3 \n"
    "vtrn.16 d0, d1 \n"
    "vtrn.16 d2, d3 \n"

    "vmov.s16 d26, #7 \n"

    "vadd.s16 d4, d0, d3 \n"  // a1 = ip[0] + ip[12]
    "vadd.s16 d5, d1, d2 \n"  // b1 = ip[4] + ip[8]
    "vsub.s16 d6, d1, d2 \n"  // c1 = ip[4] - ip[8]
    "vadd.s16 d4, d4, d26 \n"  // a1 + 7
    "vsub.s16 d7, d0, d3 \n"  // d1 = ip[0] - ip[12]

    "vadd.s16 d0, d4, d5 \n"  // op[0] = a1 + b1 + 7
    "vsub.s16 d2, d4, d5 \n"  // op[8] = a1 - b1 + 7

    "vmlal.s16 q11, d7, d16 \n"  // d1*5352 + 12000
    "vmlal.s16 q12, d7, d17 \n"  // d1*2217 + 51000

    "vceq.s16 d4, d7, #0 \n"

    "vshr.s16 d0, d0, #4 \n"
    "vshr.s16 d2, d2, #4 \n"

    "vmlal.s16 q11, d6, d17 \n"  // c1*2217 + d1*5352 + 12000
    "vmlsl.s16 q12, d6, d16 \n"  // d1*2217 - c1*5352 + 51000

    "vmvn d4, d4 \n"  // !(d1 == 0)
    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    "vshrn.s32 d1, q11, #16 \n"
    // op[4] += (d1!=0)
    "vsub.s16 d1, d1, d4 \n"
    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
    "vshrn.s32 d3, q12, #16 \n"

    // set result to out array
    "vst1.16 {q0, q1}, [%[out]] \n"
    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
      [coeff32] "+r"(coeff32)          // modified registers
    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
      [out] "r"(out)                   // constants
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13"       // clobbered
  );
}

#endif

// Loads one int16 from 'src' into lane LANE of VALUE, then advances 'src' by
// 'stride' elements. Both 'src' and 'stride' are taken from the enclosing
// scope.
#define LOAD_LANE_16b(VALUE, LANE) do {             \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
  src += stride;                                    \
} while (0)

// Walsh-Hadamard transform of 16 values read from 'src' with a stride of 16
// elements (src[0], src[16], ..., src[15 * 16]). The 16 results are written
// contiguously to 'out'.
static void FTransformWHT(const int16_t* src, int16_t* out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
  int16x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  // The n-th load (n = 0..15) goes to lane (n / 4) of in.val[n % 4].
  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0], 3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);

  {
    // a0 = in[0 * 16] + in[2 * 16]
    // a1 = in[1 * 16] + in[3 * 16]
    // a2 = in[1 * 16] - in[3 * 16]
    // a3 = in[0 * 16] - in[2 * 16]
    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    tmp0.val[0] = vaddq_s32(a0, a1);
    tmp0.val[1] = vaddq_s32(a3, a2);
    tmp0.val[2] = vsubq_s32(a3, a2);
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
    const int32x4x4_t tmp1 = Transpose4x4(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
    // Narrow back to 16b (vmovn keeps the low half of each lane).
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);

    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
#undef LOAD_LANE_16b

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// a 0123, b 0123
// a 4567, b 4567
// a 89ab, b 89ab
// a cdef, b cdef
//
// transpose
//
// a 048c, b 048c
// a 159d, b 159d
// a 26ae, b 26ae
// a 37bf, b 37bf
//
// The low and high halves of the four vectors are transposed independently,
// as two separate 4x4 matrices.
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
  return q4_in;
}

// Second butterfly pass. The returned vectors hold the *absolute values* of
// the transformed coefficients.
static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
  int16x8x4_t q4_out;
  // tmp[0] = a0 + a1
  // tmp[1] = a3 + a2
  // tmp[2] = a3 - a2
  // tmp[3] = a0 - a1
  // (vabdq computes the absolute difference in a single step)
  INIT_VECTOR4(q4_out,
               vabsq_s16(vaddq_s16(q_a0, q_a1)),
               vabsq_s16(vaddq_s16(q_a3, q_a2)),
               vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
  return q4_out;
}

// First butterfly pass, applied across the four input rows. The 8b inputs are
// widened to 16b on the fly.
static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  int16x8x4_t q4_out;

  INIT_VECTOR4(q4_out,
               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
  return q4_out;
}

// Loads the 16 weights w[0..15] as four vectors of four 16b values each. The
// unsigned weights are reinterpreted as signed.
static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
  int16x4x4_t d4_w;
  INIT_VECTOR4(d4_w,
               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
  return d4_w;
}

// Returns the weighted sum of the low halves of 'q4_in' minus the weighted
// sum of the high halves. Both lanes of the result hold the same total.
static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
                                      const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
  // sum += w[ 8] * abs(b2);
  // sum += w[12] * abs(b3);
  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
  // Subtract the same weighted terms computed on the high halves.
  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));

  q_sum0 = vaddq_s32(q_sum0, q_sum1);
  q_sum2 = vaddq_s32(q_sum2, q_sum3);
  q_sum2 = vaddq_s32(q_sum0, q_sum2);
  // Horizontal reduction of the four 32b partial sums.
  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
  d_sum = vpadd_s32(d_sum, d_sum);
  return d_sum;
}

// Loads 4 bytes from 'src' into 32b lane LANE of VALUE.
#define LOAD_LANE_32b(src, VALUE, LANE) \
    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
// More precisely, the returned value is abs(sum_b - sum_a) >> 5, where sum_a
// and sum_b are the weighted sums computed for the 4x4 blocks 'a' and 'b'
// (rows are BPS bytes apart).
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
  uint8x8x4_t d4_in;

  // load data a, b
  // Each vector holds one row of 'a' in lane 0 and the same row of 'b' in
  // lane 1, so that both blocks are processed in parallel.
  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
  INIT_VECTOR4(d4_in,
               vreinterpret_u8_u32(d_in_ab_0123),
               vreinterpret_u8_u32(d_in_ab_4567),
               vreinterpret_u8_u32(d_in_ab_89ab),
               vreinterpret_u8_u32(d_in_ab_cdef));

  {
    // Vertical pass first to avoid a transpose (vertical and horizontal passes
    // are commutative because w/kWeightY is symmetric) and subsequent
    // transpose.
    const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
    const int16x4x4_t d4_w = DistoLoadW(w);
    // horizontal pass
    const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
    const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
    int32x2_t d_sum = DistoSum(q4_h, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
    d_sum = vshr_n_s32(d_sum, 5);
    return vget_lane_s32(d_sum, 0);
  }
}
#undef LOAD_LANE_32b

// Returns the sum of Disto4x4() over the sixteen 4x4 sub-blocks of the 16x16
// blocks 'a' and 'b'. The same weights 'w' are used for every sub-block.
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  // 'y' is directly the byte offset of the sub-block's first row.
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------

// For each block j in [start_block, end_block), forward-transforms the
// difference between 'ref' and 'pred' at offset VP8DspScan[j] and counts the
// values min(abs(coeff) >> 3, MAX_COEFF_THRESH) into a local distribution
// array. The distribution is then handed to VP8SetHistogramData() together
// with 'histo'.
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
      const int16x8_t b0 = vld1q_s16(out + 8);
      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
      // Clipping keeps the bin index within the bounds of distribution[].
      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
      // Write the bin indices back to out[], to be read by the scalar loop.
      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // Convert coefficients to bin.
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}

//------------------------------------------------------------------------------

// Adds the squared differences of the 16 bytes at 'a' and 'b' to the four
// 32b partial sums in '*sum'.
static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
                                        const uint8_t* const b,
                                        uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
}

// Horizontal sum of all four uint32_t values in 'sum'.
// The addition is performed on 64 bits and the total is then cast to int.
static int SumToInt(uint32x4_t sum) {
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
  return (int)sum3;
}

// Sum of squared differences between the 16x16 blocks 'a' and 'b' (rows are
// BPS bytes apart).
static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt(sum);
}

// Same as SSE16x16_NEON, for blocks that are 16 pixels wide and 8 rows high.
static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt(sum);
}

// Sum of squared differences between the 8x8 blocks 'a' and 'b' (rows are BPS
// bytes apart).
static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    const uint8x8_t a0 = vld1_u8(a + y * BPS);
    const uint8x8_t b0 = vld1_u8(b + y * BPS);
    const uint8x8_t abs_diff = vabd_u8(a0, b0);
    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    // pair-wise add, widen and accumulate
    sum = vpadalq_u16(sum, prod);
  }
  return SumToInt(sum);
}

796 797 static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) { 798 const uint8x16_t a0 = Load4x4(a); 799 const uint8x16_t b0 = Load4x4(b); 800 const uint8x16_t abs_diff = vabdq_u8(a0, b0); 801 const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff), 802 vget_low_u8(abs_diff)); 803 const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff), 804 vget_high_u8(abs_diff)); 805 /* pair-wise adds and widen */ 806 const uint32x4_t sum1 = vpaddlq_u16(prod1); 807 const uint32x4_t sum2 = vpaddlq_u16(prod2); 808 return SumToInt(vaddq_u32(sum1, sum2)); 809 } 810 811 //------------------------------------------------------------------------------ 812 813 // Compilation with gcc-4.6.x is problematic for now. 814 #if !defined(WORK_AROUND_GCC) 815 816 static int16x8_t Quantize(int16_t* const in, 817 const VP8Matrix* const mtx, int offset) { 818 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); 819 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); 820 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); 821 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]); 822 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]); 823 824 const int16x8_t a = vld1q_s16(in + offset); // in 825 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in) 826 const int16x8_t sign = vshrq_n_s16(a, 15); // sign 827 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen 828 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq)); 829 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq)); 830 const uint32x4_t m2 = vhaddq_u32(m0, bias0); 831 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1 832 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16), 833 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1 834 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL)); 835 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign); 836 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign 837 const int16x8_t c4 = 
vmulq_s16(c3, vreinterpretq_s16_u16(q)); 838 vst1q_s16(in + offset, c4); 839 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1 840 return c3; 841 } 842 843 static const uint8_t kShuffles[4][8] = { 844 { 0, 1, 2, 3, 8, 9, 16, 17 }, 845 { 10, 11, 4, 5, 6, 7, 12, 13 }, 846 { 18, 19, 24, 25, 26, 27, 20, 21 }, 847 { 14, 15, 22, 23, 28, 29, 30, 31 } 848 }; 849 850 static int QuantizeBlock(int16_t in[16], int16_t out[16], 851 const VP8Matrix* const mtx) { 852 const int16x8_t out0 = Quantize(in, mtx, 0); 853 const int16x8_t out1 = Quantize(in, mtx, 8); 854 uint8x8x4_t shuffles; 855 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use 856 // non-standard versions there. 857 #if defined(__APPLE__) && defined(__aarch64__) && \ 858 defined(__apple_build_version__) && (__apple_build_version__< 6020037) 859 uint8x16x2_t all_out; 860 INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1)); 861 INIT_VECTOR4(shuffles, 862 vtbl2q_u8(all_out, vld1_u8(kShuffles[0])), 863 vtbl2q_u8(all_out, vld1_u8(kShuffles[1])), 864 vtbl2q_u8(all_out, vld1_u8(kShuffles[2])), 865 vtbl2q_u8(all_out, vld1_u8(kShuffles[3]))); 866 #else 867 uint8x8x4_t all_out; 868 INIT_VECTOR4(all_out, 869 vreinterpret_u8_s16(vget_low_s16(out0)), 870 vreinterpret_u8_s16(vget_high_s16(out0)), 871 vreinterpret_u8_s16(vget_low_s16(out1)), 872 vreinterpret_u8_s16(vget_high_s16(out1))); 873 INIT_VECTOR4(shuffles, 874 vtbl4_u8(all_out, vld1_u8(kShuffles[0])), 875 vtbl4_u8(all_out, vld1_u8(kShuffles[1])), 876 vtbl4_u8(all_out, vld1_u8(kShuffles[2])), 877 vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); 878 #endif 879 // Zigzag reordering 880 vst1_u8((uint8_t*)(out + 0), shuffles.val[0]); 881 vst1_u8((uint8_t*)(out + 4), shuffles.val[1]); 882 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]); 883 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]); 884 // test zeros 885 if (*(uint64_t*)(out + 0) != 0) return 1; 886 if (*(uint64_t*)(out + 4) != 0) return 1; 887 if (*(uint64_t*)(out + 
8) != 0) return 1; 888 if (*(uint64_t*)(out + 12) != 0) return 1; 889 return 0; 890 } 891 892 static int Quantize2Blocks(int16_t in[32], int16_t out[32], 893 const VP8Matrix* const mtx) { 894 int nz; 895 nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; 896 nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; 897 return nz; 898 } 899 900 #endif // !WORK_AROUND_GCC 901 902 //------------------------------------------------------------------------------ 903 // Entry point 904 905 extern void VP8EncDspInitNEON(void); 906 907 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { 908 VP8ITransform = ITransform; 909 VP8FTransform = FTransform; 910 911 VP8FTransformWHT = FTransformWHT; 912 913 VP8TDisto4x4 = Disto4x4; 914 VP8TDisto16x16 = Disto16x16; 915 VP8CollectHistogram = CollectHistogram; 916 917 VP8SSE16x16 = SSE16x16_NEON; 918 VP8SSE16x8 = SSE16x8_NEON; 919 VP8SSE8x8 = SSE8x8_NEON; 920 VP8SSE4x4 = SSE4x4_NEON; 921 922 #if !defined(WORK_AROUND_GCC) 923 VP8EncQuantizeBlock = QuantizeBlock; 924 VP8EncQuantize2Blocks = Quantize2Blocks; 925 #endif 926 } 927 928 #else // !WEBP_USE_NEON 929 930 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON) 931 932 #endif // WEBP_USE_NEON 933