/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <smmintrin.h>  // SSE4.1

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_ssse3.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

     21 static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6(
     22     __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
     23   __m128i step2[32];
     24 
     25   // stage 4
     26   step2[8] = step1[8];
     27   step2[15] = step1[15];
     28   highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
     29                           &step2[9], &step2[14]);
     30   highbd_butterfly_sse4_1(step1[13], step1[10], -cospi_8_64, cospi_24_64,
     31                           &step2[10], &step2[13]);
     32   step2[11] = step1[11];
     33   step2[12] = step1[12];
     34 
     35   // stage 5
     36   step1[8] = _mm_add_epi32(step2[8], step2[11]);
     37   step1[9] = _mm_add_epi32(step2[9], step2[10]);
     38   step1[10] = _mm_sub_epi32(step2[9], step2[10]);
     39   step1[11] = _mm_sub_epi32(step2[8], step2[11]);
     40   step1[12] = _mm_sub_epi32(step2[15], step2[12]);
     41   step1[13] = _mm_sub_epi32(step2[14], step2[13]);
     42   step1[14] = _mm_add_epi32(step2[14], step2[13]);
     43   step1[15] = _mm_add_epi32(step2[15], step2[12]);
     44 
     45   // stage 6
     46   out[8] = step1[8];
     47   out[9] = step1[9];
     48   highbd_butterfly_sse4_1(step1[13], step1[10], cospi_16_64, cospi_16_64,
     49                           &out[10], &out[13]);
     50   highbd_butterfly_sse4_1(step1[12], step1[11], cospi_16_64, cospi_16_64,
     51                           &out[11], &out[12]);
     52   out[14] = step1[14];
     53   out[15] = step1[15];
     54 }
     55 
// Applies stages 4-7 of the 32-point IDCT to lanes 16-31 of one 4x32 block.
// Reads and clobbers step1[16..31], writes out[16..31]; entries below index
// 16 are untouched.
static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7(
    __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
  __m128i step2[32];

  // stage 4: add/sub butterflies within each group of four lanes.
  step2[16] = _mm_add_epi32(step1[16], step1[19]);
  step2[17] = _mm_add_epi32(step1[17], step1[18]);
  step2[18] = _mm_sub_epi32(step1[17], step1[18]);
  step2[19] = _mm_sub_epi32(step1[16], step1[19]);
  step2[20] = _mm_sub_epi32(step1[23], step1[20]);
  step2[21] = _mm_sub_epi32(step1[22], step1[21]);
  step2[22] = _mm_add_epi32(step1[22], step1[21]);
  step2[23] = _mm_add_epi32(step1[23], step1[20]);

  step2[24] = _mm_add_epi32(step1[24], step1[27]);
  step2[25] = _mm_add_epi32(step1[25], step1[26]);
  step2[26] = _mm_sub_epi32(step1[25], step1[26]);
  step2[27] = _mm_sub_epi32(step1[24], step1[27]);
  step2[28] = _mm_sub_epi32(step1[31], step1[28]);
  step2[29] = _mm_sub_epi32(step1[30], step1[29]);
  step2[30] = _mm_add_epi32(step1[29], step1[30]);
  step2[31] = _mm_add_epi32(step1[28], step1[31]);

  // stage 5: rotate the middle lane pairs by +/-8/24 cosine factors.
  step1[16] = step2[16];
  step1[17] = step2[17];
  highbd_butterfly_sse4_1(step2[29], step2[18], cospi_24_64, cospi_8_64,
                          &step1[18], &step1[29]);
  highbd_butterfly_sse4_1(step2[28], step2[19], cospi_24_64, cospi_8_64,
                          &step1[19], &step1[28]);
  highbd_butterfly_sse4_1(step2[27], step2[20], -cospi_8_64, cospi_24_64,
                          &step1[20], &step1[27]);
  highbd_butterfly_sse4_1(step2[26], step2[21], -cospi_8_64, cospi_24_64,
                          &step1[21], &step1[26]);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6: add/sub butterflies across each group of eight lanes.
  step2[16] = _mm_add_epi32(step1[16], step1[23]);
  step2[17] = _mm_add_epi32(step1[17], step1[22]);
  step2[18] = _mm_add_epi32(step1[18], step1[21]);
  step2[19] = _mm_add_epi32(step1[19], step1[20]);
  step2[20] = _mm_sub_epi32(step1[19], step1[20]);
  step2[21] = _mm_sub_epi32(step1[18], step1[21]);
  step2[22] = _mm_sub_epi32(step1[17], step1[22]);
  step2[23] = _mm_sub_epi32(step1[16], step1[23]);

  step2[24] = _mm_sub_epi32(step1[31], step1[24]);
  step2[25] = _mm_sub_epi32(step1[30], step1[25]);
  step2[26] = _mm_sub_epi32(step1[29], step1[26]);
  step2[27] = _mm_sub_epi32(step1[28], step1[27]);
  step2[28] = _mm_add_epi32(step1[27], step1[28]);
  step2[29] = _mm_add_epi32(step1[26], step1[29]);
  step2[30] = _mm_add_epi32(step1[25], step1[30]);
  step2[31] = _mm_add_epi32(step1[24], step1[31]);

  // stage 7: final pi/4 rotations of the middle lanes, pass-through for the
  // rest.
  out[16] = step2[16];
  out[17] = step2[17];
  out[18] = step2[18];
  out[19] = step2[19];
  highbd_butterfly_sse4_1(step2[27], step2[20], cospi_16_64, cospi_16_64,
                          &out[20], &out[27]);
  highbd_butterfly_sse4_1(step2[26], step2[21], cospi_16_64, cospi_16_64,
                          &out[21], &out[26]);
  highbd_butterfly_sse4_1(step2[25], step2[22], cospi_16_64, cospi_16_64,
                          &out[22], &out[25]);
  highbd_butterfly_sse4_1(step2[24], step2[23], cospi_16_64, cospi_16_64,
                          &out[23], &out[24]);
  out[28] = step2[28];
  out[29] = step2[29];
  out[30] = step2[30];
  out[31] = step2[31];
}

// Group the coefficient calculation into smaller functions to prevent stack
// spillover in the 32x32 idct optimizations:
// quarter_1: output pixels 0-7
// quarter_2: output pixels 8-15
// quarter_3_4: output pixels 16-23 and 24-31

// For each 4x32 block __m128i in[32],
// input coefficients used: indices 0, 4, 8, 12, 16, 20, 24, 28
// output pixels: 0-7, written to out[0..7]
static INLINE void highbd_idct32_1024_4x32_quarter_1(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
  __m128i step1[8], step2[8];

  // stage 3
  highbd_butterfly_sse4_1(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4],
                          &step1[7]);
  highbd_butterfly_sse4_1(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5],
                          &step1[6]);

  // stage 4
  highbd_butterfly_sse4_1(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1],
                          &step2[0]);
  highbd_butterfly_sse4_1(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2],
                          &step2[3]);
  step2[4] = _mm_add_epi32(step1[4], step1[5]);
  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
  step2[7] = _mm_add_epi32(step1[7], step1[6]);

  // stage 5
  step1[0] = _mm_add_epi32(step2[0], step2[3]);
  step1[1] = _mm_add_epi32(step2[1], step2[2]);
  step1[2] = _mm_sub_epi32(step2[1], step2[2]);
  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
  step1[4] = step2[4];
  highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
                          &step1[5], &step1[6]);
  step1[7] = step2[7];

  // stage 6: mirror-image add/sub butterfly producing the eight outputs.
  out[0] = _mm_add_epi32(step1[0], step1[7]);
  out[1] = _mm_add_epi32(step1[1], step1[6]);
  out[2] = _mm_add_epi32(step1[2], step1[5]);
  out[3] = _mm_add_epi32(step1[3], step1[4]);
  out[4] = _mm_sub_epi32(step1[3], step1[4]);
  out[5] = _mm_sub_epi32(step1[2], step1[5]);
  out[6] = _mm_sub_epi32(step1[1], step1[6]);
  out[7] = _mm_sub_epi32(step1[0], step1[7]);
}
    184 
    185 // For each 4x32 block __m128i in[32],
    186 // Input with index, 2, 6, 10, 14, 18, 22, 26, 30
    187 // output pixels: 8-15 in __m128i out[32]
    188 static INLINE void highbd_idct32_1024_4x32_quarter_2(
    189     const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
    190   __m128i step1[32], step2[32];
    191 
    192   // stage 2
    193   highbd_butterfly_sse4_1(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8],
    194                           &step2[15]);
    195   highbd_butterfly_sse4_1(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9],
    196                           &step2[14]);
    197   highbd_butterfly_sse4_1(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10],
    198                           &step2[13]);
    199   highbd_butterfly_sse4_1(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11],
    200                           &step2[12]);
    201 
    202   // stage 3
    203   step1[8] = _mm_add_epi32(step2[8], step2[9]);
    204   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
    205   step1[14] = _mm_sub_epi32(step2[15], step2[14]);
    206   step1[15] = _mm_add_epi32(step2[15], step2[14]);
    207   step1[10] = _mm_sub_epi32(step2[11], step2[10]);
    208   step1[11] = _mm_add_epi32(step2[11], step2[10]);
    209   step1[12] = _mm_add_epi32(step2[12], step2[13]);
    210   step1[13] = _mm_sub_epi32(step2[12], step2[13]);
    211 
    212   highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
    213 }
    214 
// Computes output pixels 0-15 of the 32-point IDCT for one 4x32 block by
// combining the quarter_1 (pixels 0-7) and quarter_2 (pixels 8-15) results
// with the stage-7 add/sub butterfly.
static INLINE void highbd_idct32_1024_4x32_quarter_1_2(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i temp[16];
  highbd_idct32_1024_4x32_quarter_1(in, temp);  // fills temp[0..7]
  highbd_idct32_1024_4x32_quarter_2(in, temp);  // fills temp[8..15]
  // stage 7
  highbd_add_sub_butterfly(temp, out, 16);
}
    223 
// For each 4x32 block __m128i in[32],
// input coefficients used: all odd indices,
// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
// output pixels: 16-23 and 24-31, written to out[16..31]
static INLINE void highbd_idct32_1024_4x32_quarter_3_4(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i step1[32], step2[32];

  // stage 1: initial rotations pairing low and high odd coefficients.
  highbd_butterfly_sse4_1(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16],
                          &step1[31]);
  highbd_butterfly_sse4_1(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17],
                          &step1[30]);
  highbd_butterfly_sse4_1(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18],
                          &step1[29]);
  highbd_butterfly_sse4_1(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19],
                          &step1[28]);

  highbd_butterfly_sse4_1(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20],
                          &step1[27]);
  highbd_butterfly_sse4_1(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21],
                          &step1[26]);

  highbd_butterfly_sse4_1(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22],
                          &step1[25]);
  highbd_butterfly_sse4_1(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23],
                          &step1[24]);

  // stage 2: add/sub butterflies within adjacent lane pairs.
  step2[16] = _mm_add_epi32(step1[16], step1[17]);
  step2[17] = _mm_sub_epi32(step1[16], step1[17]);
  step2[18] = _mm_sub_epi32(step1[19], step1[18]);
  step2[19] = _mm_add_epi32(step1[19], step1[18]);
  step2[20] = _mm_add_epi32(step1[20], step1[21]);
  step2[21] = _mm_sub_epi32(step1[20], step1[21]);
  step2[22] = _mm_sub_epi32(step1[23], step1[22]);
  step2[23] = _mm_add_epi32(step1[23], step1[22]);

  step2[24] = _mm_add_epi32(step1[24], step1[25]);
  step2[25] = _mm_sub_epi32(step1[24], step1[25]);
  step2[26] = _mm_sub_epi32(step1[27], step1[26]);
  step2[27] = _mm_add_epi32(step1[27], step1[26]);
  step2[28] = _mm_add_epi32(step1[28], step1[29]);
  step2[29] = _mm_sub_epi32(step1[28], step1[29]);
  step2[30] = _mm_sub_epi32(step1[31], step1[30]);
  step2[31] = _mm_add_epi32(step1[31], step1[30]);

  // stage 3: rotate the inner lanes of each pair by +/-4/28 and +/-20/12.
  step1[16] = step2[16];
  step1[31] = step2[31];
  highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
                          &step1[17], &step1[30]);
  highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
                          &step1[18], &step1[29]);
  step1[19] = step2[19];
  step1[20] = step2[20];
  highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
                          &step1[21], &step1[26]);
  highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
                          &step1[22], &step1[25]);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
}
    291 
// Full 32-point inverse DCT on one 4x32 block of 32-bit coefficients,
// computed in place in io[32] (all 32 inputs may be nonzero).
static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) {
  __m128i temp[32];

  highbd_idct32_1024_4x32_quarter_1_2(io, temp);  // pixels 0-15 -> temp[0..15]
  highbd_idct32_1024_4x32_quarter_3_4(io, temp);  // pixels 16-31 -> temp[16..31]
  // final stage
  highbd_add_sub_butterfly(temp, io, 32);
}
    300 
    301 void vpx_highbd_idct32x32_1024_add_sse4_1(const tran_low_t *input,
    302                                           uint16_t *dest, int stride, int bd) {
    303   int i, j;
    304 
    305   if (bd == 8) {
    306     __m128i col[4][32], io[32];
    307 
    308     // rows
    309     for (i = 0; i < 4; i++) {
    310       highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]);
    311       highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]);
    312       highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]);
    313       highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]);
    314       idct32_1024_8x32(io, col[i]);
    315       input += 32 << 3;
    316     }
    317 
    318     // columns
    319     for (i = 0; i < 32; i += 8) {
    320       // Transpose 32x8 block to 8x32 block
    321       transpose_16bit_8x8(col[0] + i, io);
    322       transpose_16bit_8x8(col[1] + i, io + 8);
    323       transpose_16bit_8x8(col[2] + i, io + 16);
    324       transpose_16bit_8x8(col[3] + i, io + 24);
    325       idct32_1024_8x32(io, io);
    326       for (j = 0; j < 32; ++j) {
    327         highbd_write_buffer_8(dest + j * stride, io[j], bd);
    328       }
    329       dest += 8;
    330     }
    331   } else {
    332     __m128i all[8][32], out[32], *in;
    333 
    334     for (i = 0; i < 8; i++) {
    335       in = all[i];
    336       highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
    337       highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
    338       highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]);
    339       highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]);
    340       highbd_idct32_1024_4x32(in);
    341       input += 4 * 32;
    342     }
    343 
    344     for (i = 0; i < 32; i += 4) {
    345       transpose_32bit_4x4(all[0] + i, out + 0);
    346       transpose_32bit_4x4(all[1] + i, out + 4);
    347       transpose_32bit_4x4(all[2] + i, out + 8);
    348       transpose_32bit_4x4(all[3] + i, out + 12);
    349       transpose_32bit_4x4(all[4] + i, out + 16);
    350       transpose_32bit_4x4(all[5] + i, out + 20);
    351       transpose_32bit_4x4(all[6] + i, out + 24);
    352       transpose_32bit_4x4(all[7] + i, out + 28);
    353       highbd_idct32_1024_4x32(out);
    354 
    355       for (j = 0; j < 32; ++j) {
    356         highbd_write_buffer_4(dest + j * stride, out[j], bd);
    357       }
    358       dest += 4;
    359     }
    360   }
    361 }

// -----------------------------------------------------------------------------

// For each 4x32 block __m128i in[32],
// input coefficients used: indices 0, 4, 8, 12 (the rest are assumed zero,
// so partial butterflies replace the full ones)
// output pixels: 0-7, written to out[0..7]
static INLINE void highbd_idct32_135_4x32_quarter_1(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
  __m128i step1[8], step2[8];

  // stage 3
  highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4],
                                  &step1[7]);
  highbd_partial_butterfly_sse4_1(in[12], -cospi_20_64, cospi_12_64, &step1[5],
                                  &step1[6]);

  // stage 4
  highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1],
                                  &step2[0]);
  highbd_partial_butterfly_sse4_1(in[8], cospi_24_64, cospi_8_64, &step2[2],
                                  &step2[3]);
  step2[4] = _mm_add_epi32(step1[4], step1[5]);
  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
  step2[7] = _mm_add_epi32(step1[7], step1[6]);

  // stage 5
  step1[0] = _mm_add_epi32(step2[0], step2[3]);
  step1[1] = _mm_add_epi32(step2[1], step2[2]);
  step1[2] = _mm_sub_epi32(step2[1], step2[2]);
  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
  step1[4] = step2[4];
  highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
                          &step1[5], &step1[6]);
  step1[7] = step2[7];

  // stage 6: mirror-image add/sub butterfly producing the eight outputs.
  out[0] = _mm_add_epi32(step1[0], step1[7]);
  out[1] = _mm_add_epi32(step1[1], step1[6]);
  out[2] = _mm_add_epi32(step1[2], step1[5]);
  out[3] = _mm_add_epi32(step1[3], step1[4]);
  out[4] = _mm_sub_epi32(step1[3], step1[4]);
  out[5] = _mm_sub_epi32(step1[2], step1[5]);
  out[6] = _mm_sub_epi32(step1[1], step1[6]);
  out[7] = _mm_sub_epi32(step1[0], step1[7]);
}
    408 
    409 // For each 4x32 block __m128i in[32],
    410 // Input with index, 2, 6, 10, 14
    411 // output pixels: 8-15 in __m128i out[32]
    412 static INLINE void highbd_idct32_135_4x32_quarter_2(
    413     const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
    414   __m128i step1[32], step2[32];
    415 
    416   // stage 2
    417   highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8],
    418                                   &step2[15]);
    419   highbd_partial_butterfly_sse4_1(in[14], -cospi_18_64, cospi_14_64, &step2[9],
    420                                   &step2[14]);
    421   highbd_partial_butterfly_sse4_1(in[10], cospi_22_64, cospi_10_64, &step2[10],
    422                                   &step2[13]);
    423   highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11],
    424                                   &step2[12]);
    425 
    426   // stage 3
    427   step1[8] = _mm_add_epi32(step2[8], step2[9]);
    428   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
    429   step1[14] = _mm_sub_epi32(step2[15], step2[14]);
    430   step1[15] = _mm_add_epi32(step2[15], step2[14]);
    431   step1[10] = _mm_sub_epi32(step2[11], step2[10]);
    432   step1[11] = _mm_add_epi32(step2[11], step2[10]);
    433   step1[12] = _mm_add_epi32(step2[12], step2[13]);
    434   step1[13] = _mm_sub_epi32(step2[12], step2[13]);
    435 
    436   highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
    437 }
    438 
// Computes output pixels 0-15 of the 135-coefficient 32-point IDCT for one
// 4x32 block by combining the quarter_1 (pixels 0-7) and quarter_2 (pixels
// 8-15) results with the stage-7 add/sub butterfly.
static INLINE void highbd_idct32_135_4x32_quarter_1_2(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i temp[16];
  highbd_idct32_135_4x32_quarter_1(in, temp);  // fills temp[0..7]
  highbd_idct32_135_4x32_quarter_2(in, temp);  // fills temp[8..15]
  // stage 7
  highbd_add_sub_butterfly(temp, out, 16);
}
    447 
// For each 4x32 block __m128i in[32],
// input coefficients used: odd indices
// 1, 3, 5, 7, 9, 11, 13, 15 (the rest are assumed zero, so stage 1 uses
// partial butterflies)
// output pixels: 16-23 and 24-31, written to out[16..31]
static INLINE void highbd_idct32_135_4x32_quarter_3_4(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i step1[32], step2[32];

  // stage 1
  highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16],
                                  &step1[31]);
  highbd_partial_butterfly_sse4_1(in[15], -cospi_17_64, cospi_15_64, &step1[17],
                                  &step1[30]);
  highbd_partial_butterfly_sse4_1(in[9], cospi_23_64, cospi_9_64, &step1[18],
                                  &step1[29]);
  highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19],
                                  &step1[28]);

  highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20],
                                  &step1[27]);
  highbd_partial_butterfly_sse4_1(in[11], -cospi_21_64, cospi_11_64, &step1[21],
                                  &step1[26]);

  highbd_partial_butterfly_sse4_1(in[13], cospi_19_64, cospi_13_64, &step1[22],
                                  &step1[25]);
  highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23],
                                  &step1[24]);

  // stage 2: add/sub butterflies within adjacent lane pairs.
  step2[16] = _mm_add_epi32(step1[16], step1[17]);
  step2[17] = _mm_sub_epi32(step1[16], step1[17]);
  step2[18] = _mm_sub_epi32(step1[19], step1[18]);
  step2[19] = _mm_add_epi32(step1[19], step1[18]);
  step2[20] = _mm_add_epi32(step1[20], step1[21]);
  step2[21] = _mm_sub_epi32(step1[20], step1[21]);
  step2[22] = _mm_sub_epi32(step1[23], step1[22]);
  step2[23] = _mm_add_epi32(step1[23], step1[22]);

  step2[24] = _mm_add_epi32(step1[24], step1[25]);
  step2[25] = _mm_sub_epi32(step1[24], step1[25]);
  step2[26] = _mm_sub_epi32(step1[27], step1[26]);
  step2[27] = _mm_add_epi32(step1[27], step1[26]);
  step2[28] = _mm_add_epi32(step1[28], step1[29]);
  step2[29] = _mm_sub_epi32(step1[28], step1[29]);
  step2[30] = _mm_sub_epi32(step1[31], step1[30]);
  step2[31] = _mm_add_epi32(step1[31], step1[30]);

  // stage 3: rotate the inner lanes of each pair by +/-4/28 and +/-20/12.
  step1[16] = step2[16];
  step1[31] = step2[31];
  highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
                          &step1[17], &step1[30]);
  highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
                          &step1[18], &step1[29]);
  step1[19] = step2[19];
  step1[20] = step2[20];
  highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
                          &step1[21], &step1[26]);
  highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
                          &step1[22], &step1[25]);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
}
    515 
// 32-point inverse DCT on one 4x32 block, specialized for streams where only
// coefficients with index < 16 in each dimension can be nonzero (eob <= 135);
// computed in place in io[32].
static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) {
  __m128i temp[32];

  highbd_idct32_135_4x32_quarter_1_2(io, temp);  // pixels 0-15 -> temp[0..15]
  highbd_idct32_135_4x32_quarter_3_4(io, temp);  // pixels 16-31 -> temp[16..31]
  // final stage
  highbd_add_sub_butterfly(temp, io, 32);
}
    524 
    525 void vpx_highbd_idct32x32_135_add_sse4_1(const tran_low_t *input,
    526                                          uint16_t *dest, int stride, int bd) {
    527   int i, j;
    528 
    529   if (bd == 8) {
    530     __m128i col[2][32], in[32], out[32];
    531 
    532     // rows
    533     for (i = 0; i < 2; i++) {
    534       highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
    535       highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]);
    536       idct32_135_8x32_ssse3(in, col[i]);
    537       input += 32 << 3;
    538     }
    539 
    540     // columns
    541     for (i = 0; i < 32; i += 8) {
    542       transpose_16bit_8x8(col[0] + i, in);
    543       transpose_16bit_8x8(col[1] + i, in + 8);
    544       idct32_135_8x32_ssse3(in, out);
    545       for (j = 0; j < 32; ++j) {
    546         highbd_write_buffer_8(dest + j * stride, out[j], bd);
    547       }
    548       dest += 8;
    549     }
    550   } else {
    551     __m128i all[8][32], out[32], *in;
    552 
    553     for (i = 0; i < 4; i++) {
    554       in = all[i];
    555       highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
    556       highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
    557       highbd_idct32_135_4x32(in);
    558       input += 4 * 32;
    559     }
    560 
    561     for (i = 0; i < 32; i += 4) {
    562       transpose_32bit_4x4(all[0] + i, out + 0);
    563       transpose_32bit_4x4(all[1] + i, out + 4);
    564       transpose_32bit_4x4(all[2] + i, out + 8);
    565       transpose_32bit_4x4(all[3] + i, out + 12);
    566       highbd_idct32_135_4x32(out);
    567 
    568       for (j = 0; j < 32; ++j) {
    569         highbd_write_buffer_4(dest + j * stride, out[j], bd);
    570       }
    571       dest += 4;
    572     }
    573   }
    574 }

// -----------------------------------------------------------------------------

// For each 4x32 block __m128i in[32],
// input coefficients used: indices 0, 4 (the rest are assumed zero, so
// partial butterflies and copies replace the full butterflies)
// output pixels: 0-7, written to out[0..7]
static INLINE void highbd_idct32_34_4x32_quarter_1(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
  __m128i step1[8], step2[8];

  // stage 3
  highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4],
                                  &step1[7]);

  // stage 4: with in[8..] zero, the butterflies collapse to lane copies.
  highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1],
                                  &step2[0]);
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];

  // stage 5
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
                          &step1[5], &step1[6]);
  step1[7] = step2[7];

  // stage 6: mirror-image add/sub butterfly producing the eight outputs.
  out[0] = _mm_add_epi32(step1[0], step1[7]);
  out[1] = _mm_add_epi32(step1[1], step1[6]);
  out[2] = _mm_add_epi32(step1[2], step1[5]);
  out[3] = _mm_add_epi32(step1[3], step1[4]);
  out[4] = _mm_sub_epi32(step1[3], step1[4]);
  out[5] = _mm_sub_epi32(step1[2], step1[5]);
  out[6] = _mm_sub_epi32(step1[1], step1[6]);
  out[7] = _mm_sub_epi32(step1[0], step1[7]);
}
    617 
    618 // For each 4x32 block __m128i in[32],
    619 // Input with index, 2, 6
    620 // output pixels: 8-15 in __m128i out[32]
    621 static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/,
    622                                                    __m128i *out /*out[16]*/) {
    623   __m128i step1[32], step2[32];
    624 
    625   // stage 2
    626   highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8],
    627                                   &step2[15]);
    628   highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11],
    629                                   &step2[12]);
    630 
    631   // stage 3
    632   step1[8] = step2[8];
    633   step1[9] = step2[8];
    634   step1[14] = step2[15];
    635   step1[15] = step2[15];
    636   step1[10] = step2[11];
    637   step1[11] = step2[11];
    638   step1[12] = step2[12];
    639   step1[13] = step2[12];
    640 
    641   highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
    642 }
    643 
// Computes output pixels 0-15 of the 34-coefficient 32-point IDCT for one
// 4x32 block by combining the quarter_1 (pixels 0-7) and quarter_2 (pixels
// 8-15) results with the stage-7 add/sub butterfly.
static INLINE void highbd_idct32_34_4x32_quarter_1_2(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i temp[16];
  highbd_idct32_34_4x32_quarter_1(in, temp);  // fills temp[0..7]
  highbd_idct32_34_4x32_quarter_2(in, temp);  // fills temp[8..15]
  // stage 7
  highbd_add_sub_butterfly(temp, out, 16);
}
    652 
// For each 4x32 block __m128i in[32],
// input coefficients used: odd indices 1, 3, 5, 7 (the rest are assumed
// zero, so stage 1 uses partial butterflies and stage 2 collapses to copies)
// output pixels: 16-23 and 24-31, written to out[16..31]
static INLINE void highbd_idct32_34_4x32_quarter_3_4(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i step1[32], step2[32];

  // stage 1
  highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16],
                                  &step1[31]);
  highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19],
                                  &step1[28]);

  highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20],
                                  &step1[27]);
  highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23],
                                  &step1[24]);

  // stage 2: with the paired lanes zero, the butterflies reduce to copies.
  step2[16] = step1[16];
  step2[17] = step1[16];
  step2[18] = step1[19];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[21] = step1[20];
  step2[22] = step1[23];
  step2[23] = step1[23];

  step2[24] = step1[24];
  step2[25] = step1[24];
  step2[26] = step1[27];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[29] = step1[28];
  step2[30] = step1[31];
  step2[31] = step1[31];

  // stage 3: rotate the inner lanes of each pair by +/-4/28 and +/-20/12.
  step1[16] = step2[16];
  step1[31] = step2[31];
  highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
                          &step1[17], &step1[30]);
  highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
                          &step1[18], &step1[29]);
  step1[19] = step2[19];
  step1[20] = step2[20];
  highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
                          &step1[21], &step1[26]);
  highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
                          &step1[22], &step1[25]);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
}
    711 
// 32-point inverse DCT on one 4x32 block, specialized for streams where only
// coefficients with index < 8 in each dimension can be nonzero (eob <= 34);
// computed in place in io[32].
static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) {
  __m128i temp[32];

  highbd_idct32_34_4x32_quarter_1_2(io, temp);  // pixels 0-15 -> temp[0..15]
  highbd_idct32_34_4x32_quarter_3_4(io, temp);  // pixels 16-31 -> temp[16..31]
  // final stage
  highbd_add_sub_butterfly(temp, io, 32);
}
    720 
    721 void vpx_highbd_idct32x32_34_add_sse4_1(const tran_low_t *input, uint16_t *dest,
    722                                         int stride, int bd) {
    723   int i, j;
    724 
    725   if (bd == 8) {
    726     __m128i col[32], in[32], out[32];
    727 
    728     // rows
    729     highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
    730     idct32_34_8x32_ssse3(in, col);
    731 
    732     // columns
    733     for (i = 0; i < 32; i += 8) {
    734       transpose_16bit_8x8(col + i, in);
    735       idct32_34_8x32_ssse3(in, out);
    736       for (j = 0; j < 32; ++j) {
    737         highbd_write_buffer_8(dest + j * stride, out[j], bd);
    738       }
    739       dest += 8;
    740     }
    741   } else {
    742     __m128i all[8][32], out[32], *in;
    743 
    744     for (i = 0; i < 4; i++) {
    745       in = all[i];
    746       highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
    747       highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
    748       highbd_idct32_34_4x32(in);
    749       input += 4 * 32;
    750     }
    751 
    752     for (i = 0; i < 32; i += 4) {
    753       transpose_32bit_4x4(all[0] + i, out + 0);
    754       transpose_32bit_4x4(all[1] + i, out + 4);
    755       transpose_32bit_4x4(all[2] + i, out + 8);
    756       transpose_32bit_4x4(all[3] + i, out + 12);
    757       highbd_idct32_34_4x32(out);
    758 
    759       for (j = 0; j < 32; ++j) {
    760         highbd_write_buffer_4(dest + j * stride, out[j], bd);
    761       }
    762       dest += 4;
    763     }
    764   }
    765 }
    766