Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <emmintrin.h>  // SSE2
     12 
     13 #include "./vpx_dsp_rtcd.h"
     14 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
     15 #include "vpx_dsp/x86/inv_txfm_sse2.h"
     16 #include "vpx_dsp/x86/transpose_sse2.h"
     17 #include "vpx_dsp/x86/txfm_common_sse2.h"
     18 
     19 static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
     20                                              __m128i *const out) {
     21   // stage 5
     22   out[0] = _mm_add_epi32(in[0], in[3]);
     23   out[1] = _mm_add_epi32(in[1], in[2]);
     24   out[2] = _mm_sub_epi32(in[1], in[2]);
     25   out[3] = _mm_sub_epi32(in[0], in[3]);
     26   highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
     27   out[8] = _mm_add_epi32(in[8], in[11]);
     28   out[9] = _mm_add_epi32(in[9], in[10]);
     29   out[10] = _mm_sub_epi32(in[9], in[10]);
     30   out[11] = _mm_sub_epi32(in[8], in[11]);
     31   out[12] = _mm_sub_epi32(in[15], in[12]);
     32   out[13] = _mm_sub_epi32(in[14], in[13]);
     33   out[14] = _mm_add_epi32(in[14], in[13]);
     34   out[15] = _mm_add_epi32(in[15], in[12]);
     35 }
     36 
     37 static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
     38                                              __m128i *const out) {
     39   out[0] = _mm_add_epi32(in[0], in[7]);
     40   out[1] = _mm_add_epi32(in[1], in[6]);
     41   out[2] = _mm_add_epi32(in[2], in[5]);
     42   out[3] = _mm_add_epi32(in[3], in[4]);
     43   out[4] = _mm_sub_epi32(in[3], in[4]);
     44   out[5] = _mm_sub_epi32(in[2], in[5]);
     45   out[6] = _mm_sub_epi32(in[1], in[6]);
     46   out[7] = _mm_sub_epi32(in[0], in[7]);
     47   out[8] = in[8];
     48   out[9] = in[9];
     49   highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
     50   highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
     51   out[14] = in[14];
     52   out[15] = in[15];
     53 }
     54 
     55 static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
     56   __m128i step1[16], step2[16];
     57 
     58   // stage 2
     59   highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
     60                         &step2[15]);
     61   highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
     62                         &step2[14]);
     63   highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
     64                         &step2[13]);
     65   highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
     66                         &step2[12]);
     67 
     68   // stage 3
     69   highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
     70                         &step1[7]);
     71   highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
     72                         &step1[6]);
     73   step1[8] = _mm_add_epi32(step2[8], step2[9]);
     74   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
     75   step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
     76   step1[11] = _mm_add_epi32(step2[10], step2[11]);
     77   step1[12] = _mm_add_epi32(step2[13], step2[12]);
     78   step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
     79   step1[14] = _mm_sub_epi32(step2[15], step2[14]);
     80   step1[15] = _mm_add_epi32(step2[15], step2[14]);
     81 
     82   // stage 4
     83   highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
     84   highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
     85                         &step2[3]);
     86   highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
     87                         &step2[14]);
     88   highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
     89                         &step2[13], &step2[10]);
     90   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
     91   step1[4] = _mm_add_epi32(step1[4], step1[5]);
     92   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
     93   step1[7] = _mm_add_epi32(step1[7], step1[6]);
     94   step2[8] = step1[8];
     95   step2[11] = step1[11];
     96   step2[12] = step1[12];
     97   step2[15] = step1[15];
     98 
     99   highbd_idct16_4col_stage5(step2, step1);
    100   highbd_idct16_4col_stage6(step1, step2);
    101   highbd_idct16_4col_stage7(step2, io);
    102 }
    103 
    104 static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
    105   __m128i step1[16], step2[16];
    106   __m128i temp1[2], sign[2];
    107 
    108   // stage 2
    109   highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
    110                                 &step2[15]);
    111   highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9],
    112                                     &step2[14]);
    113   highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10],
    114                                 &step2[13]);
    115   highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
    116                                     &step2[12]);
    117 
    118   // stage 3
    119   highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
    120                                 &step1[7]);
    121   highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5],
    122                                     &step1[6]);
    123   step1[8] = _mm_add_epi32(step2[8], step2[9]);
    124   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
    125   step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
    126   step1[11] = _mm_add_epi32(step2[10], step2[11]);
    127   step1[12] = _mm_add_epi32(step2[13], step2[12]);
    128   step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
    129   step1[14] = _mm_sub_epi32(step2[15], step2[14]);
    130   step1[15] = _mm_add_epi32(step2[15], step2[14]);
    131 
    132   // stage 4
    133   abs_extend_64bit_sse2(io[0], temp1, sign);
    134   step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
    135   step2[1] = step2[0];
    136   highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2],
    137                                 &step2[3]);
    138   highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
    139                         &step2[14]);
    140   highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
    141                         &step2[13], &step2[10]);
    142   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
    143   step1[4] = _mm_add_epi32(step1[4], step1[5]);
    144   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
    145   step1[7] = _mm_add_epi32(step1[7], step1[6]);
    146   step2[8] = step1[8];
    147   step2[11] = step1[11];
    148   step2[12] = step1[12];
    149   step2[15] = step1[15];
    150 
    151   highbd_idct16_4col_stage5(step2, step1);
    152   highbd_idct16_4col_stage6(step1, step2);
    153   highbd_idct16_4col_stage7(step2, io);
    154 }
    155 
    156 static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
    157   __m128i step1[16], step2[16];
    158   __m128i temp[2], sign[2];
    159 
    160   // stage 2
    161   highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
    162                                 &step2[15]);
    163   highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
    164                                     &step2[12]);
    165 
    166   // stage 3
    167   highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
    168                                 &step1[7]);
    169   step1[8] = step2[8];
    170   step1[9] = step2[8];
    171   step1[10] =
    172       _mm_sub_epi32(_mm_setzero_si128(), step2[11]);  // step1[10] = -step1[10]
    173   step1[11] = step2[11];
    174   step1[12] = step2[12];
    175   step1[13] =
    176       _mm_sub_epi32(_mm_setzero_si128(), step2[12]);  // step1[13] = -step1[13]
    177   step1[14] = step2[15];
    178   step1[15] = step2[15];
    179 
    180   // stage 4
    181   abs_extend_64bit_sse2(io[0], temp, sign);
    182   step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64);
    183   step2[1] = step2[0];
    184   step2[2] = _mm_setzero_si128();
    185   step2[3] = _mm_setzero_si128();
    186   highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
    187                         &step2[14]);
    188   highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
    189                         &step2[13], &step2[10]);
    190   step2[5] = step1[4];
    191   step2[6] = step1[7];
    192   step2[8] = step1[8];
    193   step2[11] = step1[11];
    194   step2[12] = step1[12];
    195   step2[15] = step1[15];
    196 
    197   highbd_idct16_4col_stage5(step2, step1);
    198   highbd_idct16_4col_stage6(step1, step2);
    199   highbd_idct16_4col_stage7(step2, io);
    200 }
    201 
    202 void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
    203                                        int stride, int bd) {
    204   int i;
    205   __m128i out[16], *in;
    206 
    207   if (bd == 8) {
    208     __m128i l[16], r[16];
    209 
    210     in = l;
    211     for (i = 0; i < 2; i++) {
    212       highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
    213       highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
    214       idct16_8col(in, in);
    215       in = r;
    216       input += 128;
    217     }
    218 
    219     for (i = 0; i < 16; i += 8) {
    220       int j;
    221       transpose_16bit_8x8(l + i, out);
    222       transpose_16bit_8x8(r + i, out + 8);
    223       idct16_8col(out, out);
    224 
    225       for (j = 0; j < 16; ++j) {
    226         highbd_write_buffer_8(dest + j * stride, out[j], bd);
    227       }
    228       dest += 8;
    229     }
    230   } else {
    231     __m128i all[4][16];
    232 
    233     for (i = 0; i < 4; i++) {
    234       in = all[i];
    235       highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
    236       highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
    237       highbd_idct16_4col(in);
    238       input += 4 * 16;
    239     }
    240 
    241     for (i = 0; i < 16; i += 4) {
    242       int j;
    243       transpose_32bit_4x4(all[0] + i, out + 0);
    244       transpose_32bit_4x4(all[1] + i, out + 4);
    245       transpose_32bit_4x4(all[2] + i, out + 8);
    246       transpose_32bit_4x4(all[3] + i, out + 12);
    247       highbd_idct16_4col(out);
    248 
    249       for (j = 0; j < 16; ++j) {
    250         highbd_write_buffer_4(dest + j * stride, out[j], bd);
    251       }
    252       dest += 4;
    253     }
    254   }
    255 }
    256 
    257 void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest,
    258                                       int stride, int bd) {
    259   int i;
    260   __m128i out[16];
    261 
    262   if (bd == 8) {
    263     __m128i in[16], temp[16];
    264 
    265     highbd_load_pack_transpose_32bit_8x8(input, 16, in);
    266     for (i = 8; i < 16; i++) {
    267       in[i] = _mm_setzero_si128();
    268     }
    269     idct16_8col(in, temp);
    270 
    271     for (i = 0; i < 16; i += 8) {
    272       int j;
    273       transpose_16bit_8x8(temp + i, in);
    274       idct16_8col(in, out);
    275 
    276       for (j = 0; j < 16; ++j) {
    277         highbd_write_buffer_8(dest + j * stride, out[j], bd);
    278       }
    279       dest += 8;
    280     }
    281   } else {
    282     __m128i all[2][16], *in;
    283 
    284     for (i = 0; i < 2; i++) {
    285       in = all[i];
    286       highbd_load_transpose_32bit_8x4(input, 16, in);
    287       highbd_idct16x16_38_4col(in);
    288       input += 4 * 16;
    289     }
    290 
    291     for (i = 0; i < 16; i += 4) {
    292       int j;
    293       transpose_32bit_4x4(all[0] + i, out + 0);
    294       transpose_32bit_4x4(all[1] + i, out + 4);
    295       highbd_idct16x16_38_4col(out);
    296 
    297       for (j = 0; j < 16; ++j) {
    298         highbd_write_buffer_4(dest + j * stride, out[j], bd);
    299       }
    300       dest += 4;
    301     }
    302   }
    303 }
    304 
    305 void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
    306                                       int stride, int bd) {
    307   int i;
    308   __m128i out[16];
    309 
    310   if (bd == 8) {
    311     __m128i in[16], l[16];
    312 
    313     in[0] = load_pack_8_32bit(input + 0 * 16);
    314     in[1] = load_pack_8_32bit(input + 1 * 16);
    315     in[2] = load_pack_8_32bit(input + 2 * 16);
    316     in[3] = load_pack_8_32bit(input + 3 * 16);
    317 
    318     idct16x16_10_pass1(in, l);
    319 
    320     for (i = 0; i < 16; i += 8) {
    321       int j;
    322       idct16x16_10_pass2(l + i, in);
    323 
    324       for (j = 0; j < 16; ++j) {
    325         highbd_write_buffer_8(dest + j * stride, in[j], bd);
    326       }
    327       dest += 8;
    328     }
    329   } else {
    330     __m128i all[2][16], *in;
    331 
    332     for (i = 0; i < 2; i++) {
    333       in = all[i];
    334       highbd_load_transpose_32bit_4x4(input, 16, in);
    335       highbd_idct16x16_10_4col(in);
    336       input += 4 * 16;
    337     }
    338 
    339     for (i = 0; i < 16; i += 4) {
    340       int j;
    341       transpose_32bit_4x4(&all[0][i], out);
    342       highbd_idct16x16_10_4col(out);
    343 
    344       for (j = 0; j < 16; ++j) {
    345         highbd_write_buffer_4(dest + j * stride, out[j], bd);
    346       }
    347       dest += 4;
    348     }
    349   }
    350 }
    351 
    352 void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
    353                                      int stride, int bd) {
    354   highbd_idct_1_add_kernel(input, dest, stride, bd, 16);
    355 }
    356