/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

static INLINE void transpose_16bit_4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}

void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i eight = _mm_set1_epi16(8);
  __m128i in[2];

  // Rows
  in[0] = load_input_data8(input);
  in[1] = load_input_data8(input + 8);
  idct4_sse2(in);

  // Columns
  idct4_sse2(in);

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);
  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  recon_and_store4x4_sse2(in, dest, stride);
}

void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  const __m128i zero = _mm_setzero_si128();
  int a;
  __m128i dc_value, d[2];

  a = (int)dct_const_round_shift((int16_t)input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  // Reconstruction and Store
  d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
  d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
  d[0] = _mm_unpacklo_epi32(d[0],
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
  d[1] = _mm_unpacklo_epi32(
      _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
  d[0] = _mm_unpacklo_epi8(d[0], zero);
  d[1] = _mm_unpacklo_epi8(d[1], zero);
  d[0] = _mm_add_epi16(d[0], dc_value);
  d[1] = _mm_add_epi16(d[1], dc_value);
  d[0] = _mm_packus_epi16(d[0], d[1]);

  *(int *)dest = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
}

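// idct4_sse2() below runs one 1-D 4-point IDCT on a 4x4 block whose four rows
// are packed two-per-register in in[0]/in[1]. As a rough scalar sketch of the
// butterfly it vectorizes (illustrative only, not the shipped C reference):
//   step0 = round_shift((x0 + x2) * cospi_16_64);
//   step1 = round_shift((x0 - x2) * cospi_16_64);
//   step2 = round_shift(x1 * cospi_24_64 - x3 * cospi_8_64);
//   step3 = round_shift(x1 * cospi_8_64 + x3 * cospi_24_64);
//   out   = { step0 + step3, step1 + step2, step1 - step2, step0 - step3 };
// Stage 1 corresponds to the two idct_calc_wraplow_sse2() calls; stage 2 is
// the add/sub pair plus the _mm_shuffle_epi32() that restores output order.
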
void idct4_sse2(__m128i *const in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  __m128i u[2];

  transpose_16bit_4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]);
  u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}

void iadst4_sse2(__m128i *const in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_16bit_4(in);
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}

static INLINE void load_buffer_8x8(const tran_low_t *const input,
                                   __m128i *const in) {
  in[0] = load_input_data8(input + 0 * 8);
  in[1] = load_input_data8(input + 1 * 8);
  in[2] = load_input_data8(input + 2 * 8);
  in[3] = load_input_data8(input + 3 * 8);
  in[4] = load_input_data8(input + 4 * 8);
  in[5] = load_input_data8(input + 5 * 8);
  in[6] = load_input_data8(input + 6 * 8);
  in[7] = load_input_data8(input + 7 * 8);
}

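// Full 8x8 inverse DCT: two identical 1-D passes. Each idct8_sse2() call
// transposes its input first, so running it twice gives rows then columns.
// Final rounding (a right shift by 5, judging from the DC-only path below)
// and reconstruction are handled by write_buffer_8x8().
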
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  __m128i in[8];
  int i;

  // Load input data.
  load_buffer_8x8(input, in);

  // 2-D
  for (i = 0; i < 2; i++) {
    idct8_sse2(in);
  }

  write_buffer_8x8(in, dest, stride);
}

void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  __m128i io[8];

  io[0] = load_input_data4(input + 0 * 8);
  io[1] = load_input_data4(input + 1 * 8);
  io[2] = load_input_data4(input + 2 * 8);
  io[3] = load_input_data4(input + 3 * 8);

  idct8x8_12_add_kernel_sse2(io);
  write_buffer_8x8(io, dest, stride);
}

static INLINE void recon_and_store_8_dual(uint8_t *const dest,
                                          const __m128i in_x,
                                          const int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d0, d1;

  d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride));
  d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride));
  d0 = _mm_unpacklo_epi8(d0, zero);
  d1 = _mm_unpacklo_epi8(d1, zero);
  d0 = _mm_add_epi16(in_x, d0);
  d1 = _mm_add_epi16(in_x, d1);
  d0 = _mm_packus_epi16(d0, d1);
  _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0);
  _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0));
}

void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  __m128i dc_value;
  tran_high_t a1;
  tran_low_t out =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 5);
  dc_value = _mm_set1_epi16((int16_t)a1);

  recon_and_store_8_dual(dest, dc_value, stride);
  dest += 2 * stride;
  recon_and_store_8_dual(dest, dc_value, stride);
  dest += 2 * stride;
  recon_and_store_8_dual(dest, dc_value, stride);
  dest += 2 * stride;
  recon_and_store_8_dual(dest, dc_value, stride);
}

void idct8_sse2(__m128i *const in) {
  // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
  transpose_16bit_8x8(in, in);

  // 4-stage 1D idct8x8
  idct8(in, in);
}

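// 8-point inverse ADST on eight packed columns. The flow below mirrors the
// scalar iadst8: a transpose, an input permutation, then three butterfly
// stages. 32-bit intermediates are rounded with DCT_CONST_ROUNDING, shifted by
// DCT_CONST_BITS and packed back to 16 bits, and the final outputs are written
// in a fixed permutation with the odd-indexed outputs negated.
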
void iadst8_sse2(__m128i *const in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  transpose_16bit_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  s2 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_p16);
  s3 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_m16);
  s6 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_p16);
  s7 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_m16);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

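// 16x16 helpers. idct16_load8x8() pulls in eight rows of a 16-wide
// coefficient block (hence the * 16 stride). The 16x16 transforms below work
// on the block in two 8x16 halves per pass, which keeps the working set small
// and limits register/stack spills (the same concern noted for the 32x32 path
// further down).
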
static INLINE void idct16_load8x8(const tran_low_t *const input,
                                  __m128i *const in) {
  in[0] = load_input_data8(input + 0 * 16);
  in[1] = load_input_data8(input + 1 * 16);
  in[2] = load_input_data8(input + 2 * 16);
  in[3] = load_input_data8(input + 3 * 16);
  in[4] = load_input_data8(input + 4 * 16);
  in[5] = load_input_data8(input + 5 * 16);
  in[6] = load_input_data8(input + 6 * 16);
  in[7] = load_input_data8(input + 7 * 16);
}

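// Full 16x16 inverse DCT. Pass 1 (rows): each iteration loads and transposes
// an 8x16 half, runs idct16_8col(), and keeps the result in l[] or r[].
// Pass 2 (columns): the halves are re-transposed eight columns at a time,
// run through idct16_8col() again, and written out one 8-pixel row at a time
// via write_buffer_8x1().
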
void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  __m128i l[16], r[16], out[16], *in;
  int i;

  in = l;
  for (i = 0; i < 2; i++) {
    idct16_load8x8(input, in);
    transpose_16bit_8x8(in, in);
    idct16_load8x8(input + 8, in + 8);
    transpose_16bit_8x8(in + 8, in + 8);
    idct16_8col(in, in);
    in = r;
    input += 128;
  }

  for (i = 0; i < 16; i += 8) {
    int j;
    transpose_16bit_8x8(l + i, out);
    transpose_16bit_8x8(r + i, out + 8);
    idct16_8col(out, out);

    for (j = 0; j < 16; ++j) {
      write_buffer_8x1(dest + j * stride, out[j]);
    }

    dest += 8;
  }
}

void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  __m128i in[16], temp[16], out[16];
  int i;

  idct16_load8x8(input, in);
  transpose_16bit_8x8(in, in);

  for (i = 8; i < 16; i++) {
    in[i] = _mm_setzero_si128();
  }
  idct16_8col(in, temp);

  for (i = 0; i < 16; i += 8) {
    int j;
    transpose_16bit_8x8(temp + i, in);
    idct16_8col(in, out);

    for (j = 0; j < 16; ++j) {
      write_buffer_8x1(dest + j * stride, out[j]);
    }

    dest += 8;
  }
}

void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  __m128i in[16], l[16];
  int i;

  // First 1-D inverse DCT
  // Load input data.
  in[0] = load_input_data4(input + 0 * 16);
  in[1] = load_input_data4(input + 1 * 16);
  in[2] = load_input_data4(input + 2 * 16);
  in[3] = load_input_data4(input + 3 * 16);

  idct16x16_10_pass1(in, l);

  // Second 1-D inverse transform, performed per 8x16 block
  for (i = 0; i < 16; i += 8) {
    int j;
    idct16x16_10_pass2(l + i, in);

    for (j = 0; j < 16; ++j) {
      write_buffer_8x1(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d0, d1;

  d0 = _mm_load_si128((__m128i *)(dest));
  d1 = _mm_unpackhi_epi8(d0, zero);
  d0 = _mm_unpacklo_epi8(d0, zero);
  d0 = _mm_add_epi16(in_x, d0);
  d1 = _mm_add_epi16(in_x, d1);
  d0 = _mm_packus_epi16(d0, d1);
  _mm_store_si128((__m128i *)(dest), d0);
}

void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  int i;
  tran_high_t a1;
  tran_low_t out =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);
  dc_value = _mm_set1_epi16((int16_t)a1);

  for (i = 0; i < 16; ++i) {
    recon_and_store_16(dest, dc_value);
    dest += stride;
  }
}

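// 16-point inverse ADST on eight packed columns, used by the hybrid
// (ADST/DCT) transform types. Four butterfly stages follow; each 16-bit
// multiply/add pair produces 32-bit products that are rounded with
// DCT_CONST_ROUNDING, shifted by DCT_CONST_BITS and packed back to 16 bits
// with saturation (_mm_packs_epi32).
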
static void iadst16_8col(__m128i *const in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

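  // Stage 1 ends with s[0..15]. In stage 2 below, s[0..7] combine by plain
  // add/sub into x[0..7], while s[8..15] go through a second butterfly using
  // the cospi_{4,28} and cospi_{20,12} constant pairs.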
  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

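  // In the stage below, x[0..3] and x[8..11] combine with plain adds/subs
  // into s[0..3]/s[8..11], while x[4..7] and x[12..15] take one more
  // cospi_{8,24} butterfly to produce s[4..7]/s[12..15].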
  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16);
  in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
  in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
  in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
  in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16);
  in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16);
  in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16);
  in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

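// Full 16-column wrappers: the 16x16 block is kept as two 8x16 halves
// (in0/in1). transpose_16bit_16x16() swaps rows and columns across both
// halves, after which the 8-column kernels above are applied to each half
// independently.
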
void idct16_sse2(__m128i *const in0, __m128i *const in1) {
  transpose_16bit_16x16(in0, in1);
  idct16_8col(in0, in0);
  idct16_8col(in1, in1);
}

void iadst16_sse2(__m128i *const in0, __m128i *const in1) {
  transpose_16bit_16x16(in0, in1);
  iadst16_8col(in0);
  iadst16_8col(in1);
}

// Group the coefficient calculation into smaller functions to prevent stack
// spillover in 32x32 idct optimizations:
// quarter_1: 0-7
// quarter_2: 8-15
// quarter_3_4: 16-23, 24-31

// For each 8x32 block __m128i in[32],
// Input with index, 0, 4
// output pixels: 0-7 in __m128i out[32]
static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/,
                                            __m128i *const out /*out[8]*/) {
  const __m128i zero = _mm_setzero_si128();
  __m128i step1[8], step2[8];

  // stage 3
  butterfly(in[4], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]);

  // stage 4
  step2[0] = butterfly_cospi16(in[0]);
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];

  // stage 5
  step1[0] = step2[0];
  step1[1] = step2[0];
  step1[2] = step2[0];
  step1[3] = step2[0];
  step1[4] = step2[4];
  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
  step1[7] = step2[7];

  // stage 6
  out[0] = _mm_add_epi16(step1[0], step1[7]);
  out[1] = _mm_add_epi16(step1[1], step1[6]);
  out[2] = _mm_add_epi16(step1[2], step1[5]);
  out[3] = _mm_add_epi16(step1[3], step1[4]);
  out[4] = _mm_sub_epi16(step1[3], step1[4]);
  out[5] = _mm_sub_epi16(step1[2], step1[5]);
  out[6] = _mm_sub_epi16(step1[1], step1[6]);
  out[7] = _mm_sub_epi16(step1[0], step1[7]);
}

// For each 8x32 block __m128i in[32],
// Input with index, 2, 6
// output pixels: 8-15 in __m128i out[32]
static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/,
                                            __m128i *const out /*out[16]*/) {
  const __m128i zero = _mm_setzero_si128();
  __m128i step1[16], step2[16];

  // stage 2
  butterfly(in[2], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
  butterfly(zero, in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);

  // stage 3
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[14] = step2[15];
  step1[15] = step2[15];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];

  idct32_8x32_quarter_2_stage_4_to_6(step1, out);
}

static INLINE void idct32_34_8x32_quarter_1_2(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i temp[16];
  idct32_34_8x32_quarter_1(in, temp);
  idct32_34_8x32_quarter_2(in, temp);
  // stage 7
  add_sub_butterfly(temp, out, 16);
}

// For each 8x32 block __m128i in[32],
// Input with odd index, 1, 3, 5, 7
// output pixels: 16-23, 24-31 in __m128i out[32]
static INLINE void idct32_34_8x32_quarter_3_4(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  const __m128i zero = _mm_setzero_si128();
  __m128i step1[32];

  // stage 1
  butterfly(in[1], zero, cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
  butterfly(zero, in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
  butterfly(in[5], zero, cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
  butterfly(zero, in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);

  // stage 3
  butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17],
            &step1[30]);
  butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18],
            &step1[29]);
  butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21],
            &step1[26]);
  butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22],
            &step1[25]);

  idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
}

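// Assembles one full 32-point column IDCT for the eob <= 34 case from the
// quarter helpers above: quarter_1_2 produces outputs 0-15, quarter_3_4
// produces 16-31, and the final add_sub_butterfly() folds the two halves
// together.
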
void idct32_34_8x32_sse2(const __m128i *const in /*in[32]*/,
                         __m128i *const out /*out[32]*/) {
  __m128i temp[32];

  idct32_34_8x32_quarter_1_2(in, temp);
  idct32_34_8x32_quarter_3_4(in, temp);
  // final stage
  add_sub_butterfly(temp, out, 32);
}

// Only upper-left 8x8 has non-zero coeff
void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  __m128i io[32], col[32];
  int i;

  // Load input data. Only need to load the top left 8x8 block.
  load_transpose_16bit_8x8(input, 32, io);
  idct32_34_8x32_sse2(io, col);

  for (i = 0; i < 32; i += 8) {
    int j;
    transpose_16bit_8x8(col + i, io);
    idct32_34_8x32_sse2(io, io);

    for (j = 0; j < 32; ++j) {
      write_buffer_8x1(dest + j * stride, io[j]);
    }

    dest += 8;
  }
}

// For each 8x32 block __m128i in[32],
// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
// output pixels: 0-7 in __m128i out[32]
static INLINE void idct32_1024_8x32_quarter_1(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
  __m128i step1[8], step2[8];

  // stage 3
  butterfly(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
  butterfly(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);

  // stage 4
  butterfly(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
  butterfly(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
  step2[4] = _mm_add_epi16(step1[4], step1[5]);
  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
  step2[7] = _mm_add_epi16(step1[7], step1[6]);

  // stage 5
  step1[0] = _mm_add_epi16(step2[0], step2[3]);
  step1[1] = _mm_add_epi16(step2[1], step2[2]);
  step1[2] = _mm_sub_epi16(step2[1], step2[2]);
  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
  step1[4] = step2[4];
  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
  step1[7] = step2[7];

  // stage 6
  out[0] = _mm_add_epi16(step1[0], step1[7]);
  out[1] = _mm_add_epi16(step1[1], step1[6]);
  out[2] = _mm_add_epi16(step1[2], step1[5]);
  out[3] = _mm_add_epi16(step1[3], step1[4]);
  out[4] = _mm_sub_epi16(step1[3], step1[4]);
  out[5] = _mm_sub_epi16(step1[2], step1[5]);
  out[6] = _mm_sub_epi16(step1[1], step1[6]);
  out[7] = _mm_sub_epi16(step1[0], step1[7]);
}

// For each 8x32 block __m128i in[32],
// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
// output pixels: 8-15 in __m128i out[32]
static INLINE void idct32_1024_8x32_quarter_2(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[16]*/) {
  __m128i step1[16], step2[16];

  // stage 2
  butterfly(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
  butterfly(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
  butterfly(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
  butterfly(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);

  // stage 3
  step1[8] = _mm_add_epi16(step2[8], step2[9]);
  step1[9] = _mm_sub_epi16(step2[8], step2[9]);
  step1[10] = _mm_sub_epi16(step2[11], step2[10]);
  step1[11] = _mm_add_epi16(step2[11], step2[10]);
  step1[12] = _mm_add_epi16(step2[12], step2[13]);
  step1[13] = _mm_sub_epi16(step2[12], step2[13]);
  step1[14] = _mm_sub_epi16(step2[15], step2[14]);
  step1[15] = _mm_add_epi16(step2[15], step2[14]);

  idct32_8x32_quarter_2_stage_4_to_6(step1, out);
}

static INLINE void idct32_1024_8x32_quarter_1_2(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i temp[16];
  idct32_1024_8x32_quarter_1(in, temp);
  idct32_1024_8x32_quarter_2(in, temp);
  // stage 7
  add_sub_butterfly(temp, out, 16);
}

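// Stages 1-3 for the sixteen odd-indexed inputs of the full-coefficient path.
// Unlike the eob <= 34 variant above, every butterfly has both inputs present,
// so no zero-register shortcuts apply; stages 4-7 are shared through
// idct32_8x32_quarter_3_4_stage_4_to_7().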
// For each 8x32 block __m128i in[32],
// Input with odd index,
// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
// output pixels: 16-23, 24-31 in __m128i out[32]
static INLINE void idct32_1024_8x32_quarter_3_4(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i step1[32], step2[32];

  // stage 1
  butterfly(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
  butterfly(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]);
  butterfly(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]);
  butterfly(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);

  butterfly(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
  butterfly(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]);

  butterfly(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]);
  butterfly(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);

  // stage 2
  step2[16] = _mm_add_epi16(step1[16], step1[17]);
  step2[17] = _mm_sub_epi16(step1[16], step1[17]);
  step2[18] = _mm_sub_epi16(step1[19], step1[18]);
  step2[19] = _mm_add_epi16(step1[19], step1[18]);
  step2[20] = _mm_add_epi16(step1[20], step1[21]);
  step2[21] = _mm_sub_epi16(step1[20], step1[21]);
  step2[22] = _mm_sub_epi16(step1[23], step1[22]);
  step2[23] = _mm_add_epi16(step1[23], step1[22]);

  step2[24] = _mm_add_epi16(step1[24], step1[25]);
  step2[25] = _mm_sub_epi16(step1[24], step1[25]);
  step2[26] = _mm_sub_epi16(step1[27], step1[26]);
  step2[27] = _mm_add_epi16(step1[27], step1[26]);
  step2[28] = _mm_add_epi16(step1[28], step1[29]);
  step2[29] = _mm_sub_epi16(step1[28], step1[29]);
  step2[30] = _mm_sub_epi16(step1[31], step1[30]);
  step2[31] = _mm_add_epi16(step1[31], step1[30]);

  // stage 3
  step1[16] = step2[16];
  step1[31] = step2[31];
  butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
            &step1[30]);
  butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
            &step1[29]);
  step1[19] = step2[19];
  step1[20] = step2[20];
  butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
            &step1[26]);
  butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
            &step1[25]);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
}

void idct32_1024_8x32(const __m128i *const in /*in[32]*/,
                      __m128i *const out /*out[32]*/) {
  __m128i temp[32];

  idct32_1024_8x32_quarter_1_2(in, temp);
  idct32_1024_8x32_quarter_3_4(in, temp);
  // final stage
  add_sub_butterfly(temp, out, 32);
}

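// Full 32x32 inverse DCT. The row pass handles the coefficients in four
// 8x32 slices (col[0..3]); the column pass then re-transposes eight columns
// at a time, runs idct32_1024_8x32() again and reconstructs via
// store_buffer_8x32().
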
void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  __m128i col[4][32], io[32];
  int i;

  // rows
  for (i = 0; i < 4; i++) {
    load_transpose_16bit_8x8(&input[0], 32, &io[0]);
    load_transpose_16bit_8x8(&input[8], 32, &io[8]);
    load_transpose_16bit_8x8(&input[16], 32, &io[16]);
    load_transpose_16bit_8x8(&input[24], 32, &io[24]);
    idct32_1024_8x32(io, col[i]);
    input += 32 << 3;
  }

  // columns
  for (i = 0; i < 32; i += 8) {
    // Transpose 32x8 block to 8x32 block
    transpose_16bit_8x8(col[0] + i, io);
    transpose_16bit_8x8(col[1] + i, io + 8);
    transpose_16bit_8x8(col[2] + i, io + 16);
    transpose_16bit_8x8(col[3] + i, io + 24);

    idct32_1024_8x32(io, io);
    store_buffer_8x32(io, dest, stride);
    dest += 8;
  }
}

void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  __m128i col[2][32], in[32], out[32];
  int i;

  for (i = 16; i < 32; i++) {
    in[i] = _mm_setzero_si128();
  }

  // rows
  for (i = 0; i < 2; i++) {
    load_transpose_16bit_8x8(&input[0], 32, &in[0]);
    load_transpose_16bit_8x8(&input[8], 32, &in[8]);
    idct32_1024_8x32(in, col[i]);
    input += 32 << 3;
  }

  // columns
  for (i = 0; i < 32; i += 8) {
    transpose_16bit_8x8(col[0] + i, in);
    transpose_16bit_8x8(col[1] + i, in + 8);
    idct32_1024_8x32(in, out);
    store_buffer_8x32(out, dest, stride);
    dest += 8;
  }
}

void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  int j;
  tran_high_t a1;
  tran_low_t out =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);
  dc_value = _mm_set1_epi16((int16_t)a1);

  for (j = 0; j < 32; ++j) {
    recon_and_store_16(dest + j * stride + 0, dc_value);
    recon_and_store_16(dest + j * stride + 16, dc_value);
  }
}