Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #ifndef VPX_DSP_X86_INV_TXFM_SSSE3_H_
     12 #define VPX_DSP_X86_INV_TXFM_SSSE3_H_
     13 
     14 #include <tmmintrin.h>
     15 
     16 #include "./vpx_dsp_rtcd.h"
     17 #include "vpx_dsp/x86/inv_txfm_sse2.h"
     18 #include "vpx_dsp/x86/transpose_sse2.h"
     19 #include "vpx_dsp/x86/txfm_common_sse2.h"
     20 
     21 static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) {
          // In-place, two-pass inverse-transform kernel over the eight vectors in
          // io. Pass 1 transposes a 4x4 block and then reads only io[0]/io[1];
          // pass 2 reads io[0..3] and finally overwrites all of io[0..7].
          // Nothing outside io is written here, so despite "add" in the name, any
          // accumulation into a destination is left to the caller.
          // NOTE(review): "8x8_12" presumably means an 8x8 IDCT whose non-zero
          // coefficients are limited to the first 12 (inside the top-left 4x4) -
          // confirm against the callers.
          //
          // Constants: the "2 *" factors feed _mm_mulhrs_epi16, which per Intel's
          // intrinsic reference yields (a * b + 0x4000) >> 15 in each 16-bit lane.
          // NOTE(review): the doubling presumably offsets a 14-bit scale of the
          // cospi_* values - confirm in the header that defines them.
          // cp_28d_4d / cp_n20d_12d / cp_8d_24d carry two different doubled
          // constants in one register for pass 1 (dual_set_epi16 presumably fills
          // each 64-bit half with one of its arguments - verify in its header).
          // cp_16_16 / cp_16_n16 are not doubled; they are only passed to
          // idct_calc_wraplow_sse2. The *_64d values broadcast one doubled
          // constant to every lane.
     22   const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
     23   const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
     24   const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
     25   const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
     26   const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
     27   const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64));
     28   const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64));
     29   const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64));
     30   const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64));
     31   const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64));
     32   const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64));
     33   const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64));
     34   __m128i step1[8], step2[8], tmp[4];
     35 
     36   // pass 1
          // Each register in this pass carries two step values, one per 64-bit
          // half; the "a&b" trailing comments name the pair held in each result.
     37 
     38   transpose_16bit_4x4(io, io);
     39   // io[0]: 00 10 20 30  01 11 21 31
     40   // io[1]: 02 12 22 32  03 13 23 33
     41 
     42   // stage 1
          // Duplicate each 64-bit half of io[0]/io[1] into both halves of a
          // register so a single multiply by a two-constant vector produces two
          // step values at once.
     43   tmp[0] = _mm_unpacklo_epi64(io[0], io[0]);
     44   tmp[1] = _mm_unpackhi_epi64(io[0], io[0]);
     45   tmp[2] = _mm_unpacklo_epi64(io[1], io[1]);
     46   tmp[3] = _mm_unpackhi_epi64(io[1], io[1]);
     47   step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d);    // step1 4&7
     48   step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d);  // step1 5&6
     49 
     50   // stage 2
     51   step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d);  // step2 0&1
     52   step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d);     // step2 3&2
     53   step2[4] = _mm_add_epi16(step1[4], step1[5]);       // step2 4&7
     54   step2[5] = _mm_sub_epi16(step1[4], step1[5]);       // step2 5&6
     55   step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]);  // step2 6
     56 
     57   // stage 3
          // Interleave step2 6 with step2 5 as 16-bit pairs and hand them to
          // idct_calc_wraplow_sse2 together with the (16,-16)/(16,16) constants.
          // tmp[0] is reused right afterwards for the even half.
     58   tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
     59   step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]);  // step1 5&6
     60   tmp[0] = _mm_add_epi16(step2[0], step2[2]);                      // step1 0&1
     61   tmp[1] = _mm_sub_epi16(step2[0], step2[2]);                      // step1 3&2
     62   step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);                   // step1 2&1
     63   step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);                   // step1 3&0
     64 
     65   // stage 4
     66   tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
     67   tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
     68   tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
     69   tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
     70 
     71   // pass 2
          // From here on every step value has its own register. Only io[0..3]
          // are read after the transpose below.
          // NOTE(review): stage 1 uses single multiplies of io[1] and io[3]
          // rather than two-input butterflies, presumably because the remaining
          // odd inputs are zero in this reduced case - confirm.
     72 
     73   idct8x8_12_transpose_16bit_4x8(tmp, io);
     74 
     75   // stage 1
     76   step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d);
     77   step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d);
     78   step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d);
     79   step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d);
     80 
     81   // stage 2
     82   step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d);  // step2[1] = step2[0]
     83   step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d);
     84   step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d);
     85   step2[4] = _mm_add_epi16(step1[4], step1[5]);
     86   step2[5] = _mm_sub_epi16(step1[4], step1[5]);
     87   step2[6] = _mm_sub_epi16(step1[7], step1[6]);
     88   step2[7] = _mm_add_epi16(step1[7], step1[6]);
     89 
     90   // stage 3
          // step2[0] stands in for step2[1] as well (see the stage 2 note), so it
          // appears in all four even-half sums and differences.
          // butterfly() writes step1[5] and step1[6] through its two out-pointers.
     91   step1[0] = _mm_add_epi16(step2[0], step2[3]);
     92   step1[1] = _mm_add_epi16(step2[0], step2[2]);
     93   step1[2] = _mm_sub_epi16(step2[0], step2[2]);
     94   step1[3] = _mm_sub_epi16(step2[0], step2[3]);
     95   butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
     96 
     97   // stage 4
          // io[k] and io[7 - k] are the sum and the difference of the same pair.
     98   io[0] = _mm_add_epi16(step1[0], step2[7]);
     99   io[1] = _mm_add_epi16(step1[1], step1[6]);
    100   io[2] = _mm_add_epi16(step1[2], step1[5]);
    101   io[3] = _mm_add_epi16(step1[3], step2[4]);
    102   io[4] = _mm_sub_epi16(step1[3], step2[4]);
    103   io[5] = _mm_sub_epi16(step1[2], step1[5]);
    104   io[6] = _mm_sub_epi16(step1[1], step1[6]);
    105   io[7] = _mm_sub_epi16(step1[0], step2[7]);
    106 }
    107 
    108 void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out);  // prototype only; the definition is not part of this header
    109 
    110 #endif  // VPX_DSP_X86_INV_TXFM_SSSE3_H_
    111