/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_X86_INV_TXFM_SSSE3_H_
#define VPX_DSP_X86_INV_TXFM_SSSE3_H_

#include <tmmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

// In-place SSSE3 kernel for the reduced ("_12") 8x8 inverse transform. It runs
// two 1-D passes of four butterfly stages each, with a transpose in front of
// each pass.
//
// io: array of eight __m128i registers (see the io[8] note in the signature),
//     used as both input and output.
//     - Pass 1 consumes only io[0] and io[1] after the initial 4x4 transpose.
//     - Pass 2 reads only io[0..3] after the second transpose.
//     - On return io[0..7] hold the second-pass results.
//     NOTE(review): the "_12" suffix presumably reflects a caller guarantee
//     that only the first few coefficients are non-zero -- confirm at the call
//     sites.
//
// Despite the "add" in the name, nothing in this function adds the result to a
// destination buffer, and no final rounding shift is applied here; the last
// stage is plain 16-bit add/sub into io[].
//
// Arithmetic notes:
//  - _mm_mulhrs_epi16(x, c) yields (x * c + 0x4000) >> 15 per 16-bit lane.
//    Every constant passed to it below is pre-doubled (2 * cospi_*), so each
//    product is effectively (x * cospi + 0x2000) >> 14, provided the doubled
//    constant still fits in int16_t.
//  - _mm_add_epi16 / _mm_sub_epi16 wrap on overflow (they do not saturate).
static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) {
  // Pass-1 constants: two different doubled cosines in one register.
  // NOTE(review): dual_set_epi16 is defined elsewhere; the lane comments below
  // imply it puts its first argument in the low 64-bit half and its second in
  // the high half -- confirm in txfm_common_sse2.h.
  const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
  const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
  const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
  // Undoubled constants, consumed only by idct_calc_wraplow_sse2() in pass 1.
  const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  // Doubled constants broadcast to all eight lanes, for _mm_mulhrs_epi16.
  // cospi_16_64d is used in both passes; the others only in pass 2.
  const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64));
  const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64));
  const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64));
  const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64));
  const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64));
  const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64));
  const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64));
  // Scratch registers. They are reused across stages and passes, so the
  // statement order below matters. step2[1] is never written or read.
  __m128i step1[8], step2[8], tmp[4];

  // pass 1
  //
  // Every register in this pass carries two 4-lane values, one per 64-bit
  // half. In the "a&b" lane comments, a is the low half and b is the high
  // half (consistent with the unpacklo/unpackhi operations used below).

  transpose_16bit_4x4(io, io);
  // io[0]: 00 10 20 30  01 11 21 31
  // io[1]: 02 12 22 32  03 13 23 33

  // stage 1
  // Duplicate each 64-bit half of io[0]/io[1] into both halves of a register,
  // so one multiply by a dual constant produces two step values at once.
  tmp[0] = _mm_unpacklo_epi64(io[0], io[0]);
  tmp[1] = _mm_unpackhi_epi64(io[0], io[0]);
  tmp[2] = _mm_unpacklo_epi64(io[1], io[1]);
  tmp[3] = _mm_unpackhi_epi64(io[1], io[1]);
  step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d);    // step1 4&7
  step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d);  // step1 5&6

  // stage 2
  step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d);  // step2 0&1
  step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d);     // step2 3&2
  step2[4] = _mm_add_epi16(step1[4], step1[5]);       // step2 4&7
  step2[5] = _mm_sub_epi16(step1[4], step1[5]);       // step2 5&6
  // Copy the high half (step2 6) into both halves so it can be interleaved
  // with step2 5 below.
  step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]);  // step2 6

  // stage 3
  // Interleave the low four lanes of step2 6 and step2 5 into (6, 5) pairs.
  // NOTE(review): idct_calc_wraplow_sse2 is defined elsewhere; presumably it
  // multiply-accumulates these pairs against each constant vector and packs
  // both results into one register -- confirm in inv_txfm_sse2.h.
  tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
  step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]);  // step1 5&6
  tmp[0] = _mm_add_epi16(step2[0], step2[2]);                      // step1 0&1
  tmp[1] = _mm_sub_epi16(step2[0], step2[2]);                      // step1 3&2
  // Regroup halves so they line up with step2[4] (4&7) and step1[5] (5&6).
  step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);                   // step1 2&1
  step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);                   // step1 3&0

  // stage 4
  // tmp[] is overwritten here with the pass-1 outputs, still two per register.
  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6

  // pass 2
  //
  // From here on each register is one full 8-lane value; only io[0..3] are
  // read as inputs.

  // Moves the packed pass-1 outputs in tmp[] back into io[] for the second
  // pass (helper defined elsewhere).
  idct8x8_12_transpose_16bit_4x8(tmp, io);

  // stage 1
  step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d);
  step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d);
  step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d);
  step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d);

  // stage 2
  // step2[1] is not computed; stage 3 uses step2[0] in its place.
  step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d);  // step2[1] = step2[0]
  step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d);
  step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d);
  step2[4] = _mm_add_epi16(step1[4], step1[5]);
  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
  step2[7] = _mm_add_epi16(step1[7], step1[6]);

  // stage 3
  step1[0] = _mm_add_epi16(step2[0], step2[3]);
  step1[1] = _mm_add_epi16(step2[0], step2[2]);
  step1[2] = _mm_sub_epi16(step2[0], step2[2]);
  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
  // Writes step1[5] and step1[6] through the output pointers.
  // NOTE(review): butterfly() is defined elsewhere; presumably the full-width
  // counterpart of the idct_calc_wraplow_sse2 step in pass 1 -- confirm.
  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);

  // stage 4
  // Mirrored outputs: io[k] is a sum and io[7 - k] the matching difference.
  io[0] = _mm_add_epi16(step1[0], step2[7]);
  io[1] = _mm_add_epi16(step1[1], step1[6]);
  io[2] = _mm_add_epi16(step1[2], step1[5]);
  io[3] = _mm_add_epi16(step1[3], step2[4]);
  io[4] = _mm_sub_epi16(step1[3], step2[4]);
  io[5] = _mm_sub_epi16(step1[2], step1[5]);
  io[6] = _mm_sub_epi16(step1[1], step1[6]);
  io[7] = _mm_sub_epi16(step1[0], step2[7]);
}

// Declaration only; the definition is not part of this header.
// NOTE(review): going by the name, presumably a 32-point inverse transform
// over eight columns for the reduced "135" coefficient case, reading `in` and
// writing `out` -- confirm against the definition.
void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out);

#endif  // VPX_DSP_X86_INV_TXFM_SSSE3_H_