// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of speed-critical encoding functions.
//
// Author: Christian Duvivier (cduvivier (at) google.com)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <stdlib.h>  // for abs()
#include <emmintrin.h>

#include "src/dsp/common_sse2.h"
#include "src/enc/cost_enc.h"
#include "src/enc/vp8i_enc.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Does one or two inverse transforms.
static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                            int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
  //
  // To be able to use signed 16-bit integers, we use the following trick to
  // have constants within range:
  // - Associated constants are obtained by subtracting the 16-bit fixed point
  //   version of one:
  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
  //      K1 = 85627  =>  k1 =  20091
  //      K2 = 35468  =>  k2 = -30068
  // - The multiplication of a variable by a constant becomes the sum of the
  //   variable and the multiplication of that variable by the associated
  //   constant:
  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k) >> 16) + x
  const __m128i k1 = _mm_set1_epi16(20091);
  const __m128i k2 = _mm_set1_epi16(-30068);
  __m128i T0, T1, T2, T3;

  // Load and concatenate the transform coefficients (we'll do two inverse
  // transforms in parallel). In the case of only one inverse transform, the
  // second half of the vectors will just contain random values we'll never
  // use nor store.
  __m128i in0, in1, in2, in3;
  {
    in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
    in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
    in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
    in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
      const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
      const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
      const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
      const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
      in3 = _mm_unpacklo_epi64(in3, inB3);
      // a00 a10 a20 a30   b00 b10 b20 b30
      // a01 a11 a21 a31   b01 b11 b21 b31
      // a02 a12 a22 a32   b02 b12 b22 b32
      // a03 a13 a23 a33   b03 b13 b23 b33
    }
  }

  // Vertical pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
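    // Illustrative check of the "trick", e.g. with x = 100:
    //   (x * K1) >> 16 = (100 * 85627) >> 16 = 130, which matches
    //   ((x * k1) >> 16) + x = ((100 * 20091) >> 16) + 100 = 30 + 100 = 130.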
    const __m128i a = _mm_add_epi16(in0, in2);
    const __m128i b = _mm_sub_epi16(in0, in2);
    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
    const __m128i c3 = _mm_sub_epi16(in1, in3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
    const __m128i d3 = _mm_add_epi16(in1, in3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);

    // Transpose the two 4x4.
    VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3);
  }

  // Horizontal pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i four = _mm_set1_epi16(4);
    const __m128i dc = _mm_add_epi16(T0, four);
    const __m128i a = _mm_add_epi16(dc, T2);
    const __m128i b = _mm_sub_epi16(dc, T2);
    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
    const __m128i c3 = _mm_sub_epi16(T1, T3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
    const __m128i d3 = _mm_add_epi16(T1, T3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);
    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

    // Transpose the two 4x4.
    VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
                           &T2, &T3);
  }

  // Add inverse transform to 'ref' and store.
  {
    const __m128i zero = _mm_setzero_si128();
    // Load the reference(s).
    __m128i ref0, ref1, ref2, ref3;
    if (do_two) {
      // Load eight bytes/pixels per line.
      ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
      ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
      ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
      ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
    } else {
      // Load four bytes/pixels per line.
      ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS]));
      ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS]));
      ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS]));
      ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS]));
    }
    // Convert to 16b.
    ref0 = _mm_unpacklo_epi8(ref0, zero);
    ref1 = _mm_unpacklo_epi8(ref1, zero);
    ref2 = _mm_unpacklo_epi8(ref2, zero);
    ref3 = _mm_unpacklo_epi8(ref3, zero);
    // Add the inverse transform(s).
    ref0 = _mm_add_epi16(ref0, T0);
    ref1 = _mm_add_epi16(ref1, T1);
    ref2 = _mm_add_epi16(ref2, T2);
    ref3 = _mm_add_epi16(ref3, T3);
    // Unsigned saturate to 8b.
    ref0 = _mm_packus_epi16(ref0, ref0);
    ref1 = _mm_packus_epi16(ref1, ref1);
    ref2 = _mm_packus_epi16(ref2, ref2);
    ref3 = _mm_packus_epi16(ref3, ref3);
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
    } else {
      // Store four bytes/pixels per line.
      WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0));
      WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1));
      WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2));
      WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3));
    }
  }
}

static void FTransformPass1_SSE2(const __m128i* const in01,
                                 const __m128i* const in23,
                                 __m128i* const out01,
                                 __m128i* const out32) {
  const __m128i k937 = _mm_set1_epi32(937);
  const __m128i k1812 = _mm_set1_epi32(1812);

  const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
  const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
  const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
                                            2217, 5352, 2217, 5352);
  const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
                                            -5352, 2217, -5352, 2217);

  // *in01 = 00 01 10 11 02 03 12 13
  // *in23 = 20 21 30 31 22 23 32 33
  const __m128i shuf01_p = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1));
  const __m128i shuf23_p = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1));
  // 00 01 10 11 03 02 13 12
  // 20 21 30 31 23 22 33 32
  const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
  const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
  // 00 01 10 11 20 21 30 31
  // 03 02 13 12 23 22 33 32
  const __m128i a01 = _mm_add_epi16(s01, s32);
  const __m128i a32 = _mm_sub_epi16(s01, s32);
  // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
  // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]

  const __m128i tmp0 = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ]
  const __m128i tmp2 = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ]
  const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
  const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
  const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
  const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
  const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9);
  const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9);
  const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
  const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
  const __m128i s_lo = _mm_unpacklo_epi16(s03, s12);  // 0 1 0 1 0 1...
  const __m128i s_hi = _mm_unpackhi_epi16(s03, s12);  // 2 3 2 3 2 3
  const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
  *out01 = _mm_unpacklo_epi32(s_lo, s_hi);
  *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
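  // Note: *out01 holds the '0' and '1' outputs in its low and high halves,
  // while *out32 holds '3' then '2', so that pass 2 can form v0 +/- v3 and
  // v1 +/- v2 with a single add/sub of the two vectors.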
}

static void FTransformPass2_SSE2(const __m128i* const v01,
                                 const __m128i* const v32,
                                 int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
                                           5352, 2217, 5352, 2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k51000 = _mm_set1_epi32(51000);

  // Same operations are done on the (0,3) and (1,2) pairs.
  // a3 = v0 - v3
  // a2 = v1 - v2
  const __m128i a32 = _mm_sub_epi16(*v01, *v32);
  const __m128i a22 = _mm_unpackhi_epi64(a32, a32);

  const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
  const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
  const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
  const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
  const __m128i d3 = _mm_add_epi32(c3, k51000);
  const __m128i e1 = _mm_srai_epi32(d1, 16);
  const __m128i e3 = _mm_srai_epi32(d3, 16);
  // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
  // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
  const __m128i f1 = _mm_packs_epi32(e1, e1);
  const __m128i f3 = _mm_packs_epi32(e3, e3);
  // g1 = f1 + (a3 != 0);
  // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
  // desired (0, 1), we add one earlier through k12000_plus_one.
  // -> g1 = f1 + 1 - (a3 == 0)
  const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

  // a0 = v0 + v3
  // a1 = v1 + v2
  const __m128i a01 = _mm_add_epi16(*v01, *v32);
  const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
  const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
  const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
  const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
  // d0 = (a0 + a1 + 7) >> 4;
  // d2 = (a0 - a1 + 7) >> 4;
  const __m128i d0 = _mm_srai_epi16(c0, 4);
  const __m128i d2 = _mm_srai_epi16(c2, 4);

  const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
  const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
  _mm_storeu_si128((__m128i*)&out[0], d0_g1);
  _mm_storeu_si128((__m128i*)&out[8], d2_f3);
}

static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  // Load src.
  const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
  const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
  const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
  const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
  // 00 01 02 03 *
  // 10 11 12 13 *
  // 20 21 22 23 *
  // 30 31 32 33 *
  // Shuffle.
  const __m128i src_0 = _mm_unpacklo_epi16(src0, src1);
  const __m128i src_1 = _mm_unpacklo_epi16(src2, src3);
  // 00 01 10 11 02 03 12 13 * * ...
  // 20 21 30 31 22 23 32 33 * * ...

  // Load ref.
  const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
  const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
  const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
  const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
  const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1);
  const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3);

  // Convert both to 16 bit.
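  // (Unpacking against 'zero' zero-extends the 8-bit pixels to 16-bit lanes,
  // so the src - ref differences below fit in int16 without overflow.)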
  const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero);
  const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero);
  const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero);
  const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero);

  // Compute the difference.
  const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b);
  const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b);
  __m128i v01, v32;

  // First pass
  FTransformPass1_SSE2(&row01, &row23, &v01, &v32);

  // Second pass
  FTransformPass2_SSE2(&v01, &v32, out);
}

static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
                             int16_t* out) {
  const __m128i zero = _mm_setzero_si128();

  // Load src and convert to 16b.
  const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
  const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
  const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
  const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
  const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
  const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
  const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
  const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
  // Load ref and convert to 16b.
  const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
  const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
  const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
  const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
  const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
  const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
  const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
  const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
  // Compute difference. -> 00 01 02 03  00' 01' 02' 03'
  const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
  const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
  const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
  const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

  // Unpack and shuffle
  // 00 01 02 03   0 0 0 0
  // 10 11 12 13   0 0 0 0
  // 20 21 22 23   0 0 0 0
  // 30 31 32 33   0 0 0 0
  const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1);
  const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3);
  const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1);
  const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3);
  __m128i v01l, v32l;
  __m128i v01h, v32h;

  // First pass
  FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l);
  FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h);

  // Second pass
  FTransformPass2_SSE2(&v01l, &v32l, out + 0);
  FTransformPass2_SSE2(&v01h, &v32h, out + 16);
}

static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
  const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
  const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
  const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
  const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]);
  const __m128i src3 = _mm_loadl_epi64((__m128i*)&in[3 * 16]);
  const __m128i A01 = _mm_unpacklo_epi16(src0, src1);  // A0 A1 | ...
  const __m128i A23 = _mm_unpacklo_epi16(src2, src3);  // A2 A3 | ...
  const __m128i B0 = _mm_adds_epi16(A01, A23);         // a0 | a1 | ...
  const __m128i B1 = _mm_subs_epi16(A01, A23);         // a3 | a2 | ...
  const __m128i C0 = _mm_unpacklo_epi32(B0, B1);  // a0 | a1 | a3 | a2 | ...
  const __m128i C1 = _mm_unpacklo_epi32(B1, B0);  // a3 | a2 | a0 | a1 | ...
  const __m128i D = _mm_unpacklo_epi64(C0, C1);   // a0 a1 a3 a2 a3 a2 a0 a1
  *out = _mm_madd_epi16(D, kMult);
}

static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
  // Input is 12b signed.
  __m128i row0, row1, row2, row3;
  // Rows are 14b signed.
  FTransformWHTRow_SSE2(in + 0 * 64, &row0);
  FTransformWHTRow_SSE2(in + 1 * 64, &row1);
  FTransformWHTRow_SSE2(in + 2 * 64, &row2);
  FTransformWHTRow_SSE2(in + 3 * 64, &row3);

  {
    // The a* are 15b signed.
    const __m128i a0 = _mm_add_epi32(row0, row2);
    const __m128i a1 = _mm_add_epi32(row1, row3);
    const __m128i a2 = _mm_sub_epi32(row1, row3);
    const __m128i a3 = _mm_sub_epi32(row0, row2);
    const __m128i a0a3 = _mm_packs_epi32(a0, a3);
    const __m128i a1a2 = _mm_packs_epi32(a1, a2);

    // The b* are 16b signed.
    const __m128i b0b1 = _mm_add_epi16(a0a3, a1a2);
    const __m128i b3b2 = _mm_sub_epi16(a0a3, a1a2);
    const __m128i tmp_b2b3 = _mm_unpackhi_epi64(b3b2, b3b2);
    const __m128i b2b3 = _mm_unpacklo_epi64(tmp_b2b3, b3b2);

    _mm_storeu_si128((__m128i*)&out[0], _mm_srai_epi16(b0b1, 1));
    _mm_storeu_si128((__m128i*)&out[8], _mm_srai_epi16(b2b3, 1));
  }
}

//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.

static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
                                  int start_block, int end_block,
                                  VP8Histogram* const histo) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    int k;

    FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
    {
      // Load.
      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
      const __m128i d0 = _mm_sub_epi16(zero, out0);
      const __m128i d1 = _mm_sub_epi16(zero, out1);
      const __m128i abs0 = _mm_max_epi16(out0, d0);  // abs(v), 16b
      const __m128i abs1 = _mm_max_epi16(out1, d1);
      // v = abs(out) >> 3
      const __m128i v0 = _mm_srai_epi16(abs0, 3);
      const __m128i v1 = _mm_srai_epi16(abs1, 3);
      // bin = min(v, MAX_COEFF_THRESH)
      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
      // Store.
      _mm_storeu_si128((__m128i*)&out[0], bin0);
      _mm_storeu_si128((__m128i*)&out[8], bin1);
    }

    // Accumulate the histogram from the binned coefficients.
    for (k = 0; k < 16; ++k) {
      ++distribution[out[k]];
    }
  }
  VP8SetHistogramData(distribution, histo);
}

//------------------------------------------------------------------------------
// Intra predictions

// helper for chroma-DC predictions
static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
  int j;
  const __m128i values = _mm_set1_epi8(v);
  for (j = 0; j < 8; ++j) {
    _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
  }
}

static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
  int j;
  const __m128i values = _mm_set1_epi8(v);
  for (j = 0; j < 16; ++j) {
    _mm_store_si128((__m128i*)(dst + j * BPS), values);
  }
}

static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
  if (size == 4) {
    int j;
    for (j = 0; j < 4; ++j) {
      memset(dst + j * BPS, value, 4);
    }
  } else if (size == 8) {
    Put8x8uv_SSE2(value, dst);
  } else {
    Put16_SSE2(value, dst);
  }
}

static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
  int j;
  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
  for (j = 0; j < 8; ++j) {
    _mm_storel_epi64((__m128i*)(dst + j * BPS), top_values);
  }
}

static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
  const __m128i top_values = _mm_load_si128((const __m128i*)top);
  int j;
  for (j = 0; j < 16; ++j) {
    _mm_store_si128((__m128i*)(dst + j * BPS), top_values);
  }
}

static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
                                          const uint8_t* top, int size) {
  if (top != NULL) {
    if (size == 8) {
      VE8uv_SSE2(dst, top);
    } else {
      VE16_SSE2(dst, top);
    }
  } else {
    Fill_SSE2(dst, 127, size);
  }
}

static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
  int j;
  for (j = 0; j < 8; ++j) {
    const __m128i values = _mm_set1_epi8(left[j]);
    _mm_storel_epi64((__m128i*)dst, values);
    dst += BPS;
  }
}

static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
  int j;
  for (j = 0; j < 16; ++j) {
    const __m128i values = _mm_set1_epi8(left[j]);
    _mm_store_si128((__m128i*)dst, values);
    dst += BPS;
  }
}

static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
                                            const uint8_t* left, int size) {
  if (left != NULL) {
    if (size == 8) {
      HE8uv_SSE2(dst, left);
    } else {
      HE16_SSE2(dst, left);
    }
  } else {
    Fill_SSE2(dst, 129, size);
  }
}

static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
                                const uint8_t* top, int size) {
  const __m128i zero = _mm_setzero_si128();
  int y;
  if (size == 8) {
    const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
    const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
    for (y = 0; y < 8; ++y, dst += BPS) {
      const int val = left[y] - left[-1];
      const __m128i base = _mm_set1_epi16(val);
      const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
      _mm_storel_epi64((__m128i*)dst, out);
    }
  } else {
    const __m128i top_values = _mm_load_si128((const __m128i*)top);
    const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero);
    const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero);
    for (y = 0; y < 16; ++y, dst += BPS) {
      const int val = left[y] - left[-1];
      const __m128i base = _mm_set1_epi16(val);
      const __m128i out_0 = _mm_add_epi16(base, top_base_0);
      const __m128i out_1 = _mm_add_epi16(base, top_base_1);
      const __m128i out = _mm_packus_epi16(out_0, out_1);
      _mm_store_si128((__m128i*)dst, out);
    }
  }
}

static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
                                        const uint8_t* top, int size) {
  if (left != NULL) {
    if (top != NULL) {
      TM_SSE2(dst, left, top, size);
    } else {
      HorizontalPred_SSE2(dst, left, size);
    }
  } else {
    // true motion without left samples (hence: with default 129 value)
    // is equivalent to VE prediction where you just copy the top samples.
    // Note that if top samples are not available, the default value is
    // then 129, and not 127 as in the VerticalPred case.
    if (top != NULL) {
      VerticalPred_SSE2(dst, top, size);
    } else {
      Fill_SSE2(dst, 129, size);
    }
  }
}

static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
                                   const uint8_t* top) {
  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
  const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
  const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
  const int DC = VP8HorizontalAdd8b(&combined) + 8;
  Put8x8uv_SSE2(DC >> 4, dst);
}

static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
  const __m128i sum = _mm_sad_epu8(top_values, zero);
  const int DC = _mm_cvtsi128_si32(sum) + 4;
  Put8x8uv_SSE2(DC >> 3, dst);
}

static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
  // 'left' is contiguous so we can reuse the top summation.
  DC8uvNoLeft_SSE2(dst, left);
}

static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
  Put8x8uv_SSE2(0x80, dst);
}

static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
                                       const uint8_t* top) {
  if (top != NULL) {
    if (left != NULL) {  // top and left present
      DC8uv_SSE2(dst, left, top);
    } else {  // top, but no left
      DC8uvNoLeft_SSE2(dst, top);
    }
  } else if (left != NULL) {  // left but no top
    DC8uvNoTop_SSE2(dst, left);
  } else {  // no top, no left, nothing.
    DC8uvNoTopLeft_SSE2(dst);
  }
}

static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
                                  const uint8_t* top) {
  const __m128i top_row = _mm_load_si128((const __m128i*)top);
  const __m128i left_row = _mm_load_si128((const __m128i*)left);
  const int DC =
      VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
  Put16_SSE2(DC >> 5, dst);
}

static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
  const __m128i top_row = _mm_load_si128((const __m128i*)top);
  const int DC = VP8HorizontalAdd8b(&top_row) + 8;
  Put16_SSE2(DC >> 4, dst);
}

static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
  // 'left' is contiguous so we can reuse the top summation.
  DC16NoLeft_SSE2(dst, left);
}

static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
  Put16_SSE2(0x80, dst);
}

static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
                                      const uint8_t* top) {
  if (top != NULL) {
    if (left != NULL) {  // top and left present
      DC16_SSE2(dst, left, top);
    } else {  // top, but no left
      DC16NoLeft_SSE2(dst, top);
    }
  } else if (left != NULL) {  // left but no top
    DC16NoTop_SSE2(dst, left);
  } else {  // no top, no left, nothing.
    DC16NoTopLeft_SSE2(dst);
  }
}

//------------------------------------------------------------------------------
// 4x4 predictions

#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1)

// We use the following 8b-arithmetic tricks:
//   (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
//   where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
// and:
//   (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb
//   where: AB = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
//   and ab = a ^ b, bc = b ^ c, lsb = (AB^BC)&1

static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
                                 const uint8_t* top) {  // vertical
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);
  const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
  const __m128i b = _mm_subs_epu8(a, lsb);
  const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
  const uint32_t vals = _mm_cvtsi128_si32(avg);
  int i;
  for (i = 0; i < 4; ++i) {
    WebPUint32ToMem(dst + i * BPS, vals);
  }
}

static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
                                 const uint8_t* top) {  // horizontal
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}

static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
  Fill_SSE2(dst, dc >> 3, 4);
}

static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
                                 const uint8_t* top) {  // Down-Left
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, top[7], 3);
  const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
  const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
  const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}
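// Worked example of the first trick above, with a = 1, b = 2, c = 4:
//   exact:  (1 + 2 * 2 + 4 + 2) >> 2 = 2
//   trick:  AC = ((1 + 4 + 1) >> 1) - ((1 ^ 4) & 1) = 3 - 1 = 2, then
//           (AC + 2 + 1) >> 1 = 2
// In SSE2, each (x + y + 1) >> 1 step is a single _mm_avg_epu8(), as used by
// the prediction functions in this section.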

static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
                                 const uint8_t* top) {  // Vertical-Right
  const __m128i one = _mm_set1_epi8(1);
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int X = top[-1];
  const __m128i XABCD = _mm_loadl_epi64((const __m128i*)(top - 1));
  const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
  const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
  const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
  const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
  const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
  const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
  const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));

  // these two are hard to implement in SSE2, so we keep the C-version:
  DST(0, 2) = AVG3(J, I, X);
  DST(0, 3) = AVG3(K, J, I);
}

static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
                                 const uint8_t* top) {  // Vertical-Left
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
  const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
  const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);
  const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);
  const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
  const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
  const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
  const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
  const __m128i abbc = _mm_or_si128(ab, bc);
  const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
  const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
  const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));

  // these two are hard to get and irregular
  DST(3, 2) = (extra_out >> 0) & 0xff;
  DST(3, 3) = (extra_out >> 8) & 0xff;
}

static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
                                 const uint8_t* top) {  // Down-right
  const __m128i one = _mm_set1_epi8(1);
  const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
  const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
  const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
  const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
  const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
  const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
  const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}

static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  DST(0, 0) = AVG2(I, J);
  DST(2, 0) = DST(0, 1) = AVG2(J, K);
  DST(2, 1) = DST(0, 2) = AVG2(K, L);
  DST(1, 0) = AVG3(I, J, K);
  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
  DST(3, 2) = DST(2, 2) =
      DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}

static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];

  DST(0, 0) = DST(2, 1) = AVG2(I, X);
  DST(0, 1) = DST(2, 2) = AVG2(J, I);
  DST(0, 2) = DST(2, 3) = AVG2(K, J);
  DST(0, 3) = AVG2(L, K);

  DST(3, 0) = AVG3(A, B, C);
  DST(2, 0) = AVG3(X, A, B);
  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
  DST(1, 3) = AVG3(L, K, J);
}

static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
  const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
  int y;
  for (y = 0; y < 4; ++y, dst += BPS) {
    const int val = top[-2 - y] - top[-1];
    const __m128i base = _mm_set1_epi16(val);
    const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
    WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
  }
}

#undef DST
#undef AVG3
#undef AVG2

//------------------------------------------------------------------------------
// luma 4x4 prediction

// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
  DC4_SSE2(I4DC4 + dst, top);
  TM4_SSE2(I4TM4 + dst, top);
  VE4_SSE2(I4VE4 + dst, top);
  HE4_SSE2(I4HE4 + dst, top);
  RD4_SSE2(I4RD4 + dst, top);
  VR4_SSE2(I4VR4 + dst, top);
  LD4_SSE2(I4LD4 + dst, top);
  VL4_SSE2(I4VL4 + dst, top);
  HD4_SSE2(I4HD4 + dst, top);
  HU4_SSE2(I4HU4 + dst, top);
}

//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)

static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
                                  const uint8_t* top) {
  // U block
  DC8uvMode_SSE2(C8DC8 + dst, left, top);
  VerticalPred_SSE2(C8VE8 + dst, top, 8);
  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
  // V block
  dst += 8;
  if (top != NULL) top += 8;
  if (left != NULL) left += 16;
  DC8uvMode_SSE2(C8DC8 + dst, left, top);
  VerticalPred_SSE2(C8VE8 + dst, top, 8);
  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
}

//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)

static void Intra16Preds_SSE2(uint8_t* dst,
                              const uint8_t* left, const uint8_t* top) {
  DC16Mode_SSE2(I16DC16 + dst, left, top);
  VerticalPred_SSE2(I16VE16 + dst, top, 16);
  HorizontalPred_SSE2(I16HE16 + dst, left, 16);
  TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
}

//------------------------------------------------------------------------------
// Metric

static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
                                                   const __m128i b,
                                                   __m128i* const sum) {
  // take abs(a-b) in 8b
  const __m128i a_b = _mm_subs_epu8(a, b);
  const __m128i b_a = _mm_subs_epu8(b, a);
  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
  // zero-extend to 16b
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
  const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
  // multiply with self
  const __m128i sum1 = _mm_madd_epi16(C0, C0);
  const __m128i sum2 = _mm_madd_epi16(C1, C1);
  *sum = _mm_add_epi32(sum1, sum2);
}

static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
                                     int num_pairs) {
  __m128i sum = _mm_setzero_si128();
  int32_t tmp[4];
  int i;

  for (i = 0; i < num_pairs; ++i) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[BPS * 0]);
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[BPS * 0]);
    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
    __m128i sum1, sum2;
    SubtractAndAccumulate_SSE2(a0, b0, &sum1);
    SubtractAndAccumulate_SSE2(a1, b1, &sum2);
    sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
    a += 2 * BPS;
    b += 2 * BPS;
  }
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}

static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
  return SSE_16xN_SSE2(a, b, 8);
}

static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
  return SSE_16xN_SSE2(a, b, 4);
}

#define LOAD_8x16b(ptr) \
  _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)

static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();
  int num_pairs = 4;
  __m128i sum = zero;
  int32_t tmp[4];
  while (num_pairs-- > 0) {
    const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
    const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
    const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
    const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
    // subtract
    const __m128i c0 = _mm_subs_epi16(a0, b0);
    const __m128i c1 = _mm_subs_epi16(a1, b1);
    // multiply/accumulate with self
    const __m128i d0 = _mm_madd_epi16(c0, c0);
    const __m128i d1 = _mm_madd_epi16(c1, c1);
    // collect
    const __m128i sum01 = _mm_add_epi32(d0, d1);
    sum = _mm_add_epi32(sum, sum01);
    a += 2 * BPS;
    b += 2 * BPS;
  }
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
#undef LOAD_8x16b

static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();

  // Load values. Note that we read 8 pixels instead of 4,
  // but the a/b buffers are over-allocated to that effect.
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
  const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
  const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
  const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
  const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
  // Combine pair of lines.
  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
  // Convert to 16b.
  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
  // subtract, square and accumulate
  const __m128i d0 = _mm_subs_epi16(a01s, b01s);
  const __m128i d1 = _mm_subs_epi16(a23s, b23s);
  const __m128i e0 = _mm_madd_epi16(d0, d0);
  const __m128i e1 = _mm_madd_epi16(d1, d1);
  const __m128i sum = _mm_add_epi32(e0, e1);

  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}

//------------------------------------------------------------------------------

static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
  const __m128i mask = _mm_set1_epi16(0x00ff);
  const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
  const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
  const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]);
  const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]);
  const __m128i b0 = _mm_srli_epi16(a0, 8);  // hi byte
  const __m128i b1 = _mm_srli_epi16(a1, 8);
  const __m128i b2 = _mm_srli_epi16(a2, 8);
  const __m128i b3 = _mm_srli_epi16(a3, 8);
  const __m128i c0 = _mm_and_si128(a0, mask);  // lo byte
  const __m128i c1 = _mm_and_si128(a1, mask);
  const __m128i c2 = _mm_and_si128(a2, mask);
  const __m128i c3 = _mm_and_si128(a3, mask);
  const __m128i d0 = _mm_add_epi32(b0, c0);
  const __m128i d1 = _mm_add_epi32(b1, c1);
  const __m128i d2 = _mm_add_epi32(b2, c2);
  const __m128i d3 = _mm_add_epi32(b3, c3);
  const __m128i e0 = _mm_add_epi32(d0, d1);
  const __m128i e1 = _mm_add_epi32(d2, d3);
  const __m128i f0 = _mm_add_epi32(e0, e1);
  uint16_t tmp[8];
  _mm_storeu_si128((__m128i*)tmp, f0);
  dc[0] = tmp[0] + tmp[1];
  dc[1] = tmp[2] + tmp[3];
  dc[2] = tmp[4] + tmp[5];
  dc[3] = tmp[6] + tmp[7];
}

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// Hadamard transform
// Returns the difference between the weighted sums of the absolute values of
// the transformed coefficients of inA and inB.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
                           const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();

  // Load and combine inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
    tmp_0 = _mm_unpacklo_epi8(inAB_0, zero);
    tmp_1 = _mm_unpacklo_epi8(inAB_1, zero);
    tmp_2 = _mm_unpacklo_epi8(inAB_2, zero);
    tmp_3 = _mm_unpacklo_epi8(inAB_3, zero);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
  }

  // Vertical pass first to avoid a transpose (vertical and horizontal passes
  // are commutative because w/kWeightY is symmetric) and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
  }

  // Horizontal pass and difference of weighted sums.
  {
    // Load all inputs.
    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    {
      const __m128i d0 = _mm_sub_epi16(zero, A_b0);
      const __m128i d1 = _mm_sub_epi16(zero, A_b2);
      const __m128i d2 = _mm_sub_epi16(zero, B_b0);
      const __m128i d3 = _mm_sub_epi16(zero, B_b2);
      A_b0 = _mm_max_epi16(A_b0, d0);  // abs(v), 16b
      A_b2 = _mm_max_epi16(A_b2, d1);
      B_b0 = _mm_max_epi16(B_b0, d2);
      B_b2 = _mm_max_epi16(B_b2, d3);
    }

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b0 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}

static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
                         const uint16_t* const w) {
  const int diff_sum = TTransform_SSE2(a, b, w);
  return abs(diff_sum) >> 5;
}

static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
                           const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_SSE2(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------
// Quantization
//

static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
                                            const uint16_t* const sharpen,
                                            const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);

  // extract sign(in)  (0x0000 if positive, 0xffff if negative)
  const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
  const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  if (sharpen != NULL) {
    const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
    const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
    coeff0 = _mm_add_epi16(coeff0, sharpen0);
    coeff8 = _mm_add_epi16(coeff8, sharpen8);
  }

  // out = (coeff * iQ + B) >> QFIX
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // out = (coeff * iQ + B)
    const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
    const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
    const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
    const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = QUANTDIV(coeff, iQ, B, QFIX)
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);

    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);

    // if (coeff > 2047) coeff = 2047
    out0 = _mm_min_epi16(out0, max_coeff_2047);
    out8 = _mm_min_epi16(out8, max_coeff_2047);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  _mm_storeu_si128((__m128i*)&in[0], in0);
  _mm_storeu_si128((__m128i*)&in[8], in8);

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
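  // (For reference, the zigzag scan order used by VP8 is:
  //    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15.)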
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}

static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
                              const VP8Matrix* const mtx) {
  return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
}

static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
                                 const VP8Matrix* const mtx) {
  return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
}

static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
                                const VP8Matrix* const mtx) {
  int nz;
  const uint16_t* const sharpen = &mtx->sharpen_[0];
  nz  = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
  nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
  return nz;
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitSSE2(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
  VP8CollectHistogram = CollectHistogram_SSE2;
  VP8EncPredLuma16 = Intra16Preds_SSE2;
  VP8EncPredChroma8 = IntraChromaPreds_SSE2;
  VP8EncPredLuma4 = Intra4Preds_SSE2;
  VP8EncQuantizeBlock = QuantizeBlock_SSE2;
  VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
  VP8ITransform = ITransform_SSE2;
  VP8FTransform = FTransform_SSE2;
  VP8FTransform2 = FTransform2_SSE2;
  VP8FTransformWHT = FTransformWHT_SSE2;
  VP8SSE16x16 = SSE16x16_SSE2;
  VP8SSE16x8 = SSE16x8_SSE2;
  VP8SSE8x8 = SSE8x8_SSE2;
  VP8SSE4x4 = SSE4x4_SSE2;
  VP8TDisto4x4 = Disto4x4_SSE2;
  VP8TDisto16x16 = Disto16x16_SSE2;
  VP8Mean16x4 = Mean16x4_SSE2;
}

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)

#endif  // WEBP_USE_SSE2