/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2

#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/fwd_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"

static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride) {
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i mask;

  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  in[0] = _mm_slli_epi16(in[0], 4);
  in[1] = _mm_slli_epi16(in[1], 4);
  in[2] = _mm_slli_epi16(in[2], 4);
  in[3] = _mm_slli_epi16(in[3], 4);

  // After the << 4 upshift every lane is a multiple of 16, so only lane 0 of
  // in[0] can match k__nonzero_bias_a (and only when it is zero). The mask
  // add then cancels the +1 from k__nonzero_bias_b for a zero DC input;
  // the net effect is: in[0] lane 0 += 1 iff it is nonzero.
  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
  in[0] = _mm_add_epi16(in[0], mask);
  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}

static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
  // Rounded final scaling: (n + 1) >> 2.
  __m128i out01 = _mm_add_epi16(in01, kOne);
  __m128i out23 = _mm_add_epi16(in23, kOne);
  out01 = _mm_srai_epi16(out01, 2);
  out23 = _mm_srai_epi16(out23, 2);
  store_output(&out01, (output + 0 * 8));
  store_output(&out23, (output + 1 * 8));
}

static INLINE void transpose_4x4(__m128i *res) {
  // Combine and transpose
  // 00 01 02 03 20 21 22 23
  // 10 11 12 13 30 31 32 33
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  // 00 10 20 30 01 11 21 31
  // 02 12 22 32 03 13 23 33
  // only use the first 4 16-bit integers
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
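
// Each rotation "butterfly" below follows the same SSE2 recipe: interleave
// two 16-bit rows with _mm_unpack*_epi16 so that each 32-bit lane holds a
// pair (x, y), multiply-accumulate with a constant built by
// pair_set_epi16(c0, c1) so that _mm_madd_epi16 yields x * c0 + y * c1 in
// 32 bits, then round and shift by DCT_CONST_BITS before packing back to
// 16 bits.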
static void fdct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u[4], v[4];
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[3], in[2]);

  v[0] = _mm_add_epi16(u[0], u[1]);
  v[1] = _mm_sub_epi16(u[0], u[1]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
  transpose_4x4(in);
}

static void fadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
  __m128i in7 = _mm_add_epi16(in[0], in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);
  u[4] = _mm_unpacklo_epi16(in[3], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_sub_epi32(v[2], v[6]);
  u[2] = _mm_add_epi32(v[3], v[4]);
  u[3] = _mm_sub_epi32(u[2], u[0]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_sub_epi32(u[4], v[5]);
  u[6] = _mm_add_epi32(u[3], u[5]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}

void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  __m128i in[4];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct4x4_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fdct4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, in, stride);
      fdct4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    default:
      assert(tx_type == ADST_ADST);
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
  }
}
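
// Forward 8x8 DCT fused with quantization: the transform output stays in
// registers (reached through the in[] pointers) and is quantized directly,
// so the raw coefficients are never stored (coeff_ptr is unused).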
void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
                            tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            int skip_block, const int16_t *round_ptr,
                            const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                            tran_low_t *dqcoeff_ptr,
                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
                            const int16_t *scan, const int16_t *iscan) {
  __m128i zero;
  int pass;

  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Load input
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
  __m128i *in[8];
  int index = 0;

  (void)scan;
  (void)coeff_ptr;

  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);

  in[0] = &in0;
  in[1] = &in1;
  in[2] = &in2;
  in[3] = &in3;
  in[4] = &in4;
  in[5] = &in5;
  in[6] = &in6;
  in[7] = &in7;

  // We do two passes, first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
    }
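    // The odd outputs (1, 3, 5, 7) need one more butterfly stage than the
    // even ones: q5/q6 are first rotated by cospi_16_64 (with an intermediate
    // round and shift), recombined with q4/q7, and only then rotated by the
    // final cosine pairs.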
    // Work on next four results
    {
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
    }
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    //    division of two 16 bits signed numbers using shifts
    //    n / 2 = (n - (n >> 15)) >> 1
    //    e.g. n = -5: (-5 - (-1)) >> 1 = -2, matching C truncation toward
    //    zero, where a plain arithmetic shift would give -3.
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
  }
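
  // Set up negative indexing for the quantizer: advance the pointers to the
  // end of the block, then run n_coeffs from -n_coeffs up to zero so the
  // loop counter doubles as the element offset.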
  iscan += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();

  if (!skip_block) {
    __m128i eob;
    __m128i round, quant, dequant;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        round = _mm_load_si128((const __m128i *)round_ptr);
        quant = _mm_load_si128((const __m128i *)quant_ptr);
        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        // Do DC and first 15 AC
        coeff0 = *in[0];
        coeff1 = *in[1];

        // Poor man's sign extract: abs(x) = (x ^ (x >> 15)) - (x >> 15)
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
        // Add one to convert from indices to counts: nzero_coeff* is -1 in
        // nonzero lanes, so the subtract adds one there.
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }
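
    // round/quant/dequant were loaded with the DC constant in their low
    // halves; the unpackhi_epi64 calls above replicated the upper (AC) half
    // into both halves, so every remaining block uses pure AC constants.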
    // AC only loop
    index = 2;
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;

        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
        coeff0 = *in[index];
        coeff1 = *in[index + 1];

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
      index += 2;
    }

    // Accumulate EOB: horizontal max of the 8 lanes via shuffle/max; the
    // maximum ends up in lanes 0 and 1, and lane 1 is extracted.
    {
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    do {
      store_tran_low(zero, qcoeff_ptr + n_coeffs);
      store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);
      store_tran_low(zero, dqcoeff_ptr + n_coeffs);
      store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}

// load 8x8 array
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));

  in[0] = _mm_slli_epi16(in[0], 2);
  in[1] = _mm_slli_epi16(in[1], 2);
  in[2] = _mm_slli_epi16(in[2], 2);
  in[3] = _mm_slli_epi16(in[3], 2);
  in[4] = _mm_slli_epi16(in[4], 2);
  in[5] = _mm_slli_epi16(in[5], 2);
  in[6] = _mm_slli_epi16(in[6], 2);
  in[7] = _mm_slli_epi16(in[7], 2);
}

// right shift and rounding: for bit == 1 the result is n / 2 =
// (n - (n >> 15)) >> 1, i.e. truncation toward zero; for bit == 2 a rounding
// constant is added first, giving (n + 1 - (n >> 15)) >> 2.
static INLINE void right_shift_8x8(__m128i *res, const int bit) {
  __m128i sign0 = _mm_srai_epi16(res[0], 15);
  __m128i sign1 = _mm_srai_epi16(res[1], 15);
  __m128i sign2 = _mm_srai_epi16(res[2], 15);
  __m128i sign3 = _mm_srai_epi16(res[3], 15);
  __m128i sign4 = _mm_srai_epi16(res[4], 15);
  __m128i sign5 = _mm_srai_epi16(res[5], 15);
  __m128i sign6 = _mm_srai_epi16(res[6], 15);
  __m128i sign7 = _mm_srai_epi16(res[7], 15);

  if (bit == 2) {
    const __m128i const_rounding = _mm_set1_epi16(1);
    res[0] = _mm_add_epi16(res[0], const_rounding);
    res[1] = _mm_add_epi16(res[1], const_rounding);
    res[2] = _mm_add_epi16(res[2], const_rounding);
    res[3] = _mm_add_epi16(res[3], const_rounding);
    res[4] = _mm_add_epi16(res[4], const_rounding);
    res[5] = _mm_add_epi16(res[5], const_rounding);
    res[6] = _mm_add_epi16(res[6], const_rounding);
    res[7] = _mm_add_epi16(res[7], const_rounding);
  }

  res[0] = _mm_sub_epi16(res[0], sign0);
  res[1] = _mm_sub_epi16(res[1], sign1);
  res[2] = _mm_sub_epi16(res[2], sign2);
  res[3] = _mm_sub_epi16(res[3], sign3);
  res[4] = _mm_sub_epi16(res[4], sign4);
  res[5] = _mm_sub_epi16(res[5], sign5);
  res[6] = _mm_sub_epi16(res[6], sign6);
  res[7] = _mm_sub_epi16(res[7], sign7);

  if (bit == 1) {
    res[0] = _mm_srai_epi16(res[0], 1);
    res[1] = _mm_srai_epi16(res[1], 1);
    res[2] = _mm_srai_epi16(res[2], 1);
    res[3] = _mm_srai_epi16(res[3], 1);
    res[4] = _mm_srai_epi16(res[4], 1);
    res[5] = _mm_srai_epi16(res[5], 1);
    res[6] = _mm_srai_epi16(res[6], 1);
    res[7] = _mm_srai_epi16(res[7], 1);
  } else {
    res[0] = _mm_srai_epi16(res[0], 2);
    res[1] = _mm_srai_epi16(res[1], 2);
    res[2] = _mm_srai_epi16(res[2], 2);
    res[3] = _mm_srai_epi16(res[3], 2);
    res[4] = _mm_srai_epi16(res[4], 2);
    res[5] = _mm_srai_epi16(res[5], 2);
    res[6] = _mm_srai_epi16(res[6], 2);
    res[7] = _mm_srai_epi16(res[7], 2);
  }
}

// write 8x8 array
static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
                                    int stride) {
  store_output(&res[0], (output + 0 * stride));
  store_output(&res[1], (output + 1 * stride));
  store_output(&res[2], (output + 2 * stride));
  store_output(&res[3], (output + 3 * stride));
  store_output(&res[4], (output + 4 * stride));
  store_output(&res[5], (output + 5 * stride));
  store_output(&res[6], (output + 6 * stride));
  store_output(&res[7], (output + 7 * stride));
}
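
// 8-point forward DCT on each of the 8 columns. The stages mirror the scalar
// fdct8: the even outputs (0, 2, 4, 6) come straight from rotations of the
// stage 1 sums, while s4..s7 pass through the extra rotation stages 2-4 to
// produce the odd outputs. The result is transposed, so two applications
// give the 2-D transform.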
static void fdct8_sse2(__m128i *in) {
  // constants
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);

  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);

  // shift and rounding
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);

  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);

  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose
  transpose_16bit_8x8(in, in);
}
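
// 8-point forward ADST on each of the 8 columns. The rows are consumed in
// the permuted order required by the ADST flow graph (see the in0..in7
// assignments below), and alternate outputs are negated at the end. Like
// fdct8_sse2, it transposes its result.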
static void fadst8_sse2(__m128i *in) {
  // Constants
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // FIXME(jingning): do subtract using bit inversion?
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);

  // transpose
  transpose_16bit_8x8(in, in);
}
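
// Hybrid 8x8 forward transforms. Each 1-D stage transposes its own output,
// so two back-to-back calls produce the 2-D transform; right_shift_8x8(in, 1)
// then applies the final divide-by-two output scaling.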
void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  __m128i in[8];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct8x8_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_8x8(input, in, stride);
      fadst8_sse2(in);
      fdct8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
    case DCT_ADST:
      load_buffer_8x8(input, in, stride);
      fdct8_sse2(in);
      fadst8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
    default:
      assert(tx_type == ADST_ADST);
      load_buffer_8x8(input, in, stride);
      fadst8_sse2(in);
      fadst8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
  }
}

static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
                                     __m128i *in1, int stride) {
  // load first 8 columns
  load_buffer_8x8(input, in0, stride);
  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);

  input += 8;
  // load second 8 columns
  load_buffer_8x8(input, in1, stride);
  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
}

static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
                                      __m128i *in1, int stride) {
  // write first 8 columns
  write_buffer_8x8(output, in0, stride);
  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
  // write second 8 columns
  output += 8;
  write_buffer_8x8(output, in1, stride);
  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
}

static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
  // perform rounding operations
  right_shift_8x8(res0, 2);
  right_shift_8x8(res0 + 8, 2);
  right_shift_8x8(res1, 2);
  right_shift_8x8(res1 + 8, 2);
}
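
// The 16x16 transforms below process the block as two 8-column halves
// (in0 and in1), so each 1-D helper works on 8 columns at a time.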
static void fdct16_8col(__m128i *in) {
  // perform 16x16 1-D DCT for 8 columns
  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  // stage 1
  i[0] = _mm_add_epi16(in[0], in[15]);
  i[1] = _mm_add_epi16(in[1], in[14]);
  i[2] = _mm_add_epi16(in[2], in[13]);
  i[3] = _mm_add_epi16(in[3], in[12]);
  i[4] = _mm_add_epi16(in[4], in[11]);
  i[5] = _mm_add_epi16(in[5], in[10]);
  i[6] = _mm_add_epi16(in[6], in[9]);
  i[7] = _mm_add_epi16(in[7], in[8]);

  s[0] = _mm_sub_epi16(in[7], in[8]);
  s[1] = _mm_sub_epi16(in[6], in[9]);
  s[2] = _mm_sub_epi16(in[5], in[10]);
  s[3] = _mm_sub_epi16(in[4], in[11]);
  s[4] = _mm_sub_epi16(in[3], in[12]);
  s[5] = _mm_sub_epi16(in[2], in[13]);
  s[6] = _mm_sub_epi16(in[1], in[14]);
  s[7] = _mm_sub_epi16(in[0], in[15]);

  p[0] = _mm_add_epi16(i[0], i[7]);
  p[1] = _mm_add_epi16(i[1], i[6]);
  p[2] = _mm_add_epi16(i[2], i[5]);
  p[3] = _mm_add_epi16(i[3], i[4]);
  p[4] = _mm_sub_epi16(i[3], i[4]);
  p[5] = _mm_sub_epi16(i[2], i[5]);
  p[6] = _mm_sub_epi16(i[1], i[6]);
  p[7] = _mm_sub_epi16(i[0], i[7]);

  u[0] = _mm_add_epi16(p[0], p[3]);
  u[1] = _mm_add_epi16(p[1], p[2]);
  u[2] = _mm_sub_epi16(p[1], p[2]);
  u[3] = _mm_sub_epi16(p[0], p[3]);

  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
  v[3] = _mm_unpackhi_epi16(u[2], u[3]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[4] = _mm_packs_epi32(u[4], u[5]);
  in[8] = _mm_packs_epi32(u[2], u[3]);
  in[12] = _mm_packs_epi32(u[6], u[7]);

  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[2], v[3]);

  t[0] = _mm_add_epi16(p[4], u[0]);
  t[1] = _mm_sub_epi16(p[4], u[0]);
  t[2] = _mm_sub_epi16(p[7], u[1]);
  t[3] = _mm_add_epi16(p[7], u[1]);

  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
  u[3] = _mm_unpackhi_epi16(t[1], t[2]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  in[2] = _mm_packs_epi32(v[0], v[1]);
  in[6] = _mm_packs_epi32(v[4], v[5]);
  in[10] = _mm_packs_epi32(v[2], v[3]);
  in[14] = _mm_packs_epi32(v[6], v[7]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
  u[3] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[2] = _mm_packs_epi32(v[0], v[1]);
  t[3] = _mm_packs_epi32(v[2], v[3]);
  t[4] = _mm_packs_epi32(v[4], v[5]);
  t[5] = _mm_packs_epi32(v[6], v[7]);

  // stage 3
  p[0] = _mm_add_epi16(s[0], t[3]);
  p[1] = _mm_add_epi16(s[1], t[2]);
  p[2] = _mm_sub_epi16(s[1], t[2]);
  p[3] = _mm_sub_epi16(s[0], t[3]);
  p[4] = _mm_sub_epi16(s[7], t[4]);
  p[5] = _mm_sub_epi16(s[6], t[5]);
  p[6] = _mm_add_epi16(s[6], t[5]);
  p[7] = _mm_add_epi16(s[7], t[4]);
  // stage 6
  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
  u[7] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v[0], v[1]);
  in[9] = _mm_packs_epi32(v[2], v[3]);
  in[5] = _mm_packs_epi32(v[4], v[5]);
  in[13] = _mm_packs_epi32(v[6], v[7]);
  in[3] = _mm_packs_epi32(v[8], v[9]);
  in[11] = _mm_packs_epi32(v[10], v[11]);
  in[7] = _mm_packs_epi32(v[12], v[13]);
  in[15] = _mm_packs_epi32(v[14], v[15]);
}
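
// fadst16_8col below follows the same arithmetic pattern as fdct16_8col:
// 32-bit _mm_madd_epi16 products, a DCT_CONST_ROUNDING bias, an arithmetic
// shift by DCT_CONST_BITS, then a saturating pack back to 16 bits per stage.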
static void fadst16_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
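
  // stage 1: butterflies of mirrored input pairs; each _mm_madd_epi16 below
  // forms a 32-bit product of the form in[a] * cospi_x +/- in[b] * cospi_y.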
  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
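
  // Descale: shift each 32-bit sum back down by DCT_CONST_BITS, mirroring
  // the rounding shift used by the scalar reference transform.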
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
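
  // Only s[8..15] were rotated in this stage, so the sums and differences
  // below need another round-and-shift pass; s[0..7] stay in 16-bit form.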
  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);
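
  // Next, x[4..7] and x[12..15] are rotated by +/-cospi_8_64/cospi_24_64,
  // while x[0..3] and x[8..11] pass straight through to the butterflies.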
  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
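
  // Merge: the pass-through terms combine as plain 16-bit sums/differences,
  // while the rotated terms are packed back from their 32-bit intermediates.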
  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
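
  // Final output permutation of the ADST; the kZero subtractions implement
  // the sign flips on outputs 1, 3, 13 and 15.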
  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

static void fdct16_sse2(__m128i *in0, __m128i *in1) {
  fdct16_8col(in0);
  fdct16_8col(in1);
  transpose_16bit_16x16(in0, in1);
}

static void fadst16_sse2(__m128i *in0, __m128i *in1) {
  fadst16_8col(in0);
  fadst16_8col(in1);
  transpose_16bit_16x16(in0, in1);
}

void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                       int tx_type) {
  __m128i in0[16], in1[16];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct16x16_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    case DCT_ADST:
      load_buffer_16x16(input, in0, in1, stride);
      fdct16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    default:
      assert(tx_type == ADST_ADST);
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
  }
}
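
/* Usage sketch (hypothetical caller, not part of this file): the transform
 * reads a 16x16 block of 16-bit residuals and writes 256 coefficients; both
 * buffers should be 16-byte aligned for the SSE2 loads and stores, e.g.
 *
 *   DECLARE_ALIGNED(16, int16_t, src_diff[16 * 16]);
 *   DECLARE_ALIGNED(16, tran_low_t, coeff[16 * 16]);
 *   vp9_fht16x16_sse2(src_diff, coeff, 16, ADST_DCT);
 */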