/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2

#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/x86/fwd_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"

static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride) {
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i mask;

  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  in[0] = _mm_slli_epi16(in[0], 4);
  in[1] = _mm_slli_epi16(in[1], 4);
  in[2] = _mm_slli_epi16(in[2], 4);
  in[3] = _mm_slli_epi16(in[3], 4);

  // Bias the DC input: add 1 to element 0 of row 0 unless it is zero,
  // matching the conditional "+= 1" the scalar 4x4 forward transform applies
  // to its first input sample. After the shift above every lane is a
  // multiple of 16, so only lane 0 (compared against zero) can match
  // k__nonzero_bias_a; the remaining lanes are unaffected.
  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
  in[0] = _mm_add_epi16(in[0], mask);
  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}

static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
  __m128i out01 = _mm_add_epi16(in01, kOne);
  __m128i out23 = _mm_add_epi16(in23, kOne);
  out01 = _mm_srai_epi16(out01, 2);
  out23 = _mm_srai_epi16(out23, 2);
  store_output(&out01, (output + 0 * 8));
  store_output(&out23, (output + 1 * 8));
}

static INLINE void transpose_4x4(__m128i *res) {
  // Combine and transpose
  // 00 01 02 03 20 21 22 23
  // 10 11 12 13 30 31 32 33
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  // 00 10 20 30 01 11 21 31
  // 02 12 22 32 03 13 23 33
  // only use the first 4 16-bit integers
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
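// For reference, a scalar sketch of the 4-point butterfly that fdct4_sse2
// vectorizes (s0..s3 are one column of shifted inputs; round_shift stands
// for the add-k__DCT_CONST_ROUNDING / shift-by-DCT_CONST_BITS pair below):
//   step0 = s0 + s3;  step1 = s1 + s2;  step2 = s1 - s2;  step3 = s0 - s3;
//   out0 = round_shift((step0 + step1) * cospi_16_64);
//   out2 = round_shift((step0 - step1) * cospi_16_64);
//   out1 = round_shift(step2 * cospi_24_64 + step3 * cospi_8_64);
//   out3 = round_shift(step3 * cospi_24_64 - step2 * cospi_8_64);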
static void fdct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u[4], v[4];
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[3], in[2]);

  v[0] = _mm_add_epi16(u[0], u[1]);
  v[1] = _mm_sub_epi16(u[0], u[1]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
  transpose_4x4(in);
}

static void fadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
  __m128i in7 = _mm_add_epi16(in[0], in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);
  u[4] = _mm_unpacklo_epi16(in[3], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_sub_epi32(v[2], v[6]);
  u[2] = _mm_add_epi32(v[3], v[4]);
  u[3] = _mm_sub_epi32(u[2], u[0]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_sub_epi32(u[4], v[5]);
  u[6] = _mm_add_epi32(u[3], u[5]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}

void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  __m128i in[4];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct4x4_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fdct4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, in, stride);
      fdct4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case ADST_ADST:
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    default: assert(0); break;
  }
}
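// Forward 8x8 DCT fused with quantization. The two transform passes below
// leave the coefficients in registers (reachable through in[]), and the
// quantization step then consumes them directly, writing
// qcoeff_ptr/dqcoeff_ptr and the end-of-block index without first spilling
// the transform output to memory.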
void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
                            int16_t *coeff_ptr, intptr_t n_coeffs,
                            int skip_block, const int16_t *round_ptr,
                            const int16_t *quant_ptr, int16_t *qcoeff_ptr,
                            int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                            uint16_t *eob_ptr, const int16_t *scan_ptr,
                            const int16_t *iscan_ptr) {
  __m128i zero;
  int pass;

  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Load input
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
  __m128i *in[8];
  int index = 0;

  (void)scan_ptr;
  (void)coeff_ptr;

  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);

  in[0] = &in0;
  in[1] = &in1;
  in[2] = &in2;
  in[3] = &in3;
  in[4] = &in4;
  in[5] = &in5;
  in[6] = &in6;
  in[7] = &in7;

  // We do two passes, first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
    }
    // Work on next four results
    {
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
    }
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    //    divide the 16-bit signed results by two using shifts:
    //    n / 2 = (n - (n >> 15)) >> 1
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
  }

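  // Advance the output pointers to the end of the block and negate n_coeffs,
  // so the loops below index with a negative offset that counts up toward
  // zero; iteration stops exactly when all coefficients have been written.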
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();

  if (!skip_block) {
    __m128i eob;
    __m128i round, quant, dequant;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        round = _mm_load_si128((const __m128i *)round_ptr);
        quant = _mm_load_si128((const __m128i *)quant_ptr);
        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        // Do DC and first 15 AC
        coeff0 = *in[0];
        coeff1 = *in[1];

        // Poor man's sign extract: abs(x) = (x ^ (x >> 15)) - (x >> 15)
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    // AC only loop
    index = 2;
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;

        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
        coeff0 = *in[index];
        coeff1 = *in[index + 1];

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
      index += 2;
    }

    // Accumulate EOB: reduce the eight 16-bit lanes to their horizontal
    // maximum. After the three shuffle/max steps lanes 0 and 1 both hold the
    // overall maximum, so lane 1 is extracted.
    {
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    do {
      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}

// load 8x8 array
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));

  in[0] = _mm_slli_epi16(in[0], 2);
  in[1] = _mm_slli_epi16(in[1], 2);
  in[2] = _mm_slli_epi16(in[2], 2);
  in[3] = _mm_slli_epi16(in[3], 2);
  in[4] = _mm_slli_epi16(in[4], 2);
  in[5] = _mm_slli_epi16(in[5], 2);
  in[6] = _mm_slli_epi16(in[6], 2);
  in[7] = _mm_slli_epi16(in[7], 2);
}

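// The rounding shifts below compute, for bit == 1,
// x = (x - (x >> 15)) >> 1 (the same signed divide-by-two identity used in
// the post-condition step of vp9_fdct8x8_quant_sse2) and, for bit == 2,
// x = (x + 1 - (x >> 15)) >> 2.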
// right shift and rounding
static INLINE void right_shift_8x8(__m128i *res, const int bit) {
  __m128i sign0 = _mm_srai_epi16(res[0], 15);
  __m128i sign1 = _mm_srai_epi16(res[1], 15);
  __m128i sign2 = _mm_srai_epi16(res[2], 15);
  __m128i sign3 = _mm_srai_epi16(res[3], 15);
  __m128i sign4 = _mm_srai_epi16(res[4], 15);
  __m128i sign5 = _mm_srai_epi16(res[5], 15);
  __m128i sign6 = _mm_srai_epi16(res[6], 15);
  __m128i sign7 = _mm_srai_epi16(res[7], 15);

  if (bit == 2) {
    const __m128i const_rounding = _mm_set1_epi16(1);
    res[0] = _mm_add_epi16(res[0], const_rounding);
    res[1] = _mm_add_epi16(res[1], const_rounding);
    res[2] = _mm_add_epi16(res[2], const_rounding);
    res[3] = _mm_add_epi16(res[3], const_rounding);
    res[4] = _mm_add_epi16(res[4], const_rounding);
    res[5] = _mm_add_epi16(res[5], const_rounding);
    res[6] = _mm_add_epi16(res[6], const_rounding);
    res[7] = _mm_add_epi16(res[7], const_rounding);
  }

  res[0] = _mm_sub_epi16(res[0], sign0);
  res[1] = _mm_sub_epi16(res[1], sign1);
  res[2] = _mm_sub_epi16(res[2], sign2);
  res[3] = _mm_sub_epi16(res[3], sign3);
  res[4] = _mm_sub_epi16(res[4], sign4);
  res[5] = _mm_sub_epi16(res[5], sign5);
  res[6] = _mm_sub_epi16(res[6], sign6);
  res[7] = _mm_sub_epi16(res[7], sign7);

  if (bit == 1) {
    res[0] = _mm_srai_epi16(res[0], 1);
    res[1] = _mm_srai_epi16(res[1], 1);
    res[2] = _mm_srai_epi16(res[2], 1);
    res[3] = _mm_srai_epi16(res[3], 1);
    res[4] = _mm_srai_epi16(res[4], 1);
    res[5] = _mm_srai_epi16(res[5], 1);
    res[6] = _mm_srai_epi16(res[6], 1);
    res[7] = _mm_srai_epi16(res[7], 1);
  } else {
    res[0] = _mm_srai_epi16(res[0], 2);
    res[1] = _mm_srai_epi16(res[1], 2);
    res[2] = _mm_srai_epi16(res[2], 2);
    res[3] = _mm_srai_epi16(res[3], 2);
    res[4] = _mm_srai_epi16(res[4], 2);
    res[5] = _mm_srai_epi16(res[5], 2);
    res[6] = _mm_srai_epi16(res[6], 2);
    res[7] = _mm_srai_epi16(res[7], 2);
  }
}

// write 8x8 array
static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
                                    int stride) {
  store_output(&res[0], (output + 0 * stride));
  store_output(&res[1], (output + 1 * stride));
  store_output(&res[2], (output + 2 * stride));
  store_output(&res[3], (output + 3 * stride));
  store_output(&res[4], (output + 4 * stride));
  store_output(&res[5], (output + 5 * stride));
  store_output(&res[6], (output + 6 * stride));
  store_output(&res[7], (output + 7 * stride));
}

static void fdct8_sse2(__m128i *in) {
  // constants
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);

  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);

  // shift and rounding
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);

  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);

  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose
  transpose_16bit_8x8(in, in);
}

static void fadst8_sse2(__m128i *in) {
  // Constants
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // FIXME(jingning): do subtract using bit inversion?
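  // Final output order with sign flips: {s0, -s4, s6, -s2, s3, -s7, s5, -s1}.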
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);

  // transpose
  transpose_16bit_8x8(in, in);
}

void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  __m128i in[8];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct8x8_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_8x8(input, in, stride);
      fadst8_sse2(in);
      fdct8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
    case DCT_ADST:
      load_buffer_8x8(input, in, stride);
      fdct8_sse2(in);
      fadst8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
    case ADST_ADST:
      load_buffer_8x8(input, in, stride);
      fadst8_sse2(in);
      fadst8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
    default: assert(0); break;
  }
}

static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
                                     __m128i *in1, int stride) {
  // load first 8 columns
  load_buffer_8x8(input, in0, stride);
  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);

  input += 8;
  // load second 8 columns
  load_buffer_8x8(input, in1, stride);
  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
}

static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
                                      __m128i *in1, int stride) {
  // write first 8 columns
  write_buffer_8x8(output, in0, stride);
  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
  // write second 8 columns
  output += 8;
  write_buffer_8x8(output, in1, stride);
  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
}

static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
  // perform rounding operations
  right_shift_8x8(res0, 2);
  right_shift_8x8(res0 + 8, 2);
  right_shift_8x8(res1, 2);
  right_shift_8x8(res1 + 8, 2);
}

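// In fdct16_8col below, even-indexed outputs (in[0], in[2], ..., in[14]) are
// derived from the stage-1 sums i[0..7], while odd-indexed outputs come from
// the stage-1 differences s[0..7].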
static void fdct16_8col(__m128i *in) {
  // perform 16x16 1-D DCT for 8 columns
  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  // stage 1
  i[0] = _mm_add_epi16(in[0], in[15]);
  i[1] = _mm_add_epi16(in[1], in[14]);
  i[2] = _mm_add_epi16(in[2], in[13]);
  i[3] = _mm_add_epi16(in[3], in[12]);
  i[4] = _mm_add_epi16(in[4], in[11]);
  i[5] = _mm_add_epi16(in[5], in[10]);
  i[6] = _mm_add_epi16(in[6], in[9]);
  i[7] = _mm_add_epi16(in[7], in[8]);

  s[0] = _mm_sub_epi16(in[7], in[8]);
  s[1] = _mm_sub_epi16(in[6], in[9]);
  s[2] = _mm_sub_epi16(in[5], in[10]);
  s[3] = _mm_sub_epi16(in[4], in[11]);
  s[4] = _mm_sub_epi16(in[3], in[12]);
  s[5] = _mm_sub_epi16(in[2], in[13]);
  s[6] = _mm_sub_epi16(in[1], in[14]);
  s[7] = _mm_sub_epi16(in[0], in[15]);

  p[0] = _mm_add_epi16(i[0], i[7]);
  p[1] = _mm_add_epi16(i[1], i[6]);
  p[2] = _mm_add_epi16(i[2], i[5]);
  p[3] = _mm_add_epi16(i[3], i[4]);
  p[4] = _mm_sub_epi16(i[3], i[4]);
  p[5] = _mm_sub_epi16(i[2], i[5]);
  p[6] = _mm_sub_epi16(i[1], i[6]);
  p[7] = _mm_sub_epi16(i[0], i[7]);

  u[0] = _mm_add_epi16(p[0], p[3]);
  u[1] = _mm_add_epi16(p[1], p[2]);
  u[2] = _mm_sub_epi16(p[1], p[2]);
  u[3] = _mm_sub_epi16(p[0], p[3]);

  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
  v[3] = _mm_unpackhi_epi16(u[2], u[3]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[4] = _mm_packs_epi32(u[4], u[5]);
  in[8] = _mm_packs_epi32(u[2], u[3]);
  in[12] = _mm_packs_epi32(u[6], u[7]);

  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[2], v[3]);

  t[0] = _mm_add_epi16(p[4], u[0]);
  t[1] = _mm_sub_epi16(p[4], u[0]);
  t[2] = _mm_sub_epi16(p[7], u[1]);
  t[3] = _mm_add_epi16(p[7], u[1]);

  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
  u[3] = _mm_unpackhi_epi16(t[1], t[2]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  in[2] = _mm_packs_epi32(v[0], v[1]);
  in[6] = _mm_packs_epi32(v[4], v[5]);
  in[10] = _mm_packs_epi32(v[2], v[3]);
  in[14] = _mm_packs_epi32(v[6], v[7]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
  u[3] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[2] = _mm_packs_epi32(v[0], v[1]);
  t[3] = _mm_packs_epi32(v[2], v[3]);
  t[4] = _mm_packs_epi32(v[4], v[5]);
  t[5] = _mm_packs_epi32(v[6], v[7]);

  // stage 3
  p[0] = _mm_add_epi16(s[0], t[3]);
  p[1] = _mm_add_epi16(s[1], t[2]);
  p[2] = _mm_sub_epi16(s[1], t[2]);
  p[3] = _mm_sub_epi16(s[0], t[3]);
  p[4] = _mm_sub_epi16(s[7], t[4]);
  p[5] = _mm_sub_epi16(s[6], t[5]);
  p[6] = _mm_add_epi16(s[6], t[5]);
  p[7] = _mm_add_epi16(s[7], t[4]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
  u[3] = _mm_unpackhi_epi16(p[2], p[5]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[1] = _mm_packs_epi32(v[0], v[1]);
  t[2] = _mm_packs_epi32(v[2], v[3]);
  t[5] = _mm_packs_epi32(v[4], v[5]);
  t[6] = _mm_packs_epi32(v[6], v[7]);

  // stage 5
  s[0] = _mm_add_epi16(p[0], t[1]);
  s[1] = _mm_sub_epi16(p[0], t[1]);
  s[2] = _mm_add_epi16(p[3], t[2]);
  s[3] = _mm_sub_epi16(p[3], t[2]);
  s[4] = _mm_sub_epi16(p[4], t[5]);
  s[5] = _mm_add_epi16(p[4], t[5]);
  s[6] = _mm_sub_epi16(p[7], t[6]);
  s[7] = _mm_add_epi16(p[7], t[6]);

  // stage 6
  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
  u[7] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v[0], v[1]);
  in[9] = _mm_packs_epi32(v[2], v[3]);
  in[5] = _mm_packs_epi32(v[4], v[5]);
  in[13] = _mm_packs_epi32(v[6], v[7]);
  in[3] = _mm_packs_epi32(v[8], v[9]);
  in[11] = _mm_packs_epi32(v[10], v[11]);
  in[7] = _mm_packs_epi32(v[12], v[13]);
  in[15] = _mm_packs_epi32(v[14], v[15]);
}

static void fadst16_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

static void fdct16_sse2(__m128i *in0, __m128i *in1) {
  // 1-D 16x16 DCT over both 8-column halves, then transpose so the next
  // pass operates on rows.
  fdct16_8col(in0);
  fdct16_8col(in1);
  transpose_16bit_16x16(in0, in1);
}

static void fadst16_sse2(__m128i *in0, __m128i *in1) {
  // 1-D 16x16 ADST over both 8-column halves, then transpose so the next
  // pass operates on rows.
  fadst16_8col(in0);
  fadst16_8col(in1);
  transpose_16bit_16x16(in0, in1);
}

void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                       int tx_type) {
  __m128i in0[16], in1[16];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct16x16_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    case DCT_ADST:
      load_buffer_16x16(input, in0, in1, stride);
      fdct16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    case ADST_ADST:
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    default: assert(0); break;
  }
}
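
#if 0
/* Illustrative usage sketch, not part of the library build (hence the #if 0
 * guard): a minimal example of how the vp9_fht16x16_sse2 entry point above
 * might be driven. The function name, buffer names, and the sample residual
 * values here are hypothetical; DECLARE_ALIGNED, tran_low_t, and the tx_type
 * constants come from the headers this file already includes. */
static void example_fht16x16_usage(void) {
  DECLARE_ALIGNED(16, int16_t, residual[16 * 16]);
  DECLARE_ALIGNED(16, tran_low_t, coeffs[16 * 16]);
  int i;
  // Fill a hypothetical 16x16 residual block, laid out row by row with a
  // stride of 16 samples.
  for (i = 0; i < 16 * 16; ++i) residual[i] = (int16_t)((i & 31) - 16);
  // Forward hybrid transform, ADST in both directions; DCT_DCT, ADST_DCT,
  // and DCT_ADST select the other row/column combinations.
  vp9_fht16x16_sse2(residual, coeffs, 16, ADST_ADST);
}
#endif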