/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

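// Each 2-D inverse transform below runs as two 1-D passes. The 1-D helpers
// transpose their input first, so calling one twice gives the row pass
// followed by the column pass. Intermediate products are kept in 32 bits
// and narrowed with (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS, matching
// the rounding of the C reference code in vpx_dsp/inv_txfm.c.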
void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i eight = _mm_set1_epi16(8);
  __m128i in[2];

  // Rows
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 8);
  idct4_sse2(in);

  // Columns
  idct4_sse2(in);

  // Final round and shift: (x + 8) >> 4 implements ROUND_POWER_OF_TWO(x, 4).
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);
  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  recon_and_store4x4_sse2(in, dest, stride);
}

void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  const __m128i zero = _mm_setzero_si128();
  int a;
  __m128i dc_value, d[2];

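  // DC-only shortcut: with a single nonzero coefficient, each 1-D pass
  // reduces to one multiply by cospi_16_64 plus the usual rounding, and the
  // final stage rounds by 2^4 exactly as in the full 4x4 transform above.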
  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  // Reconstruction and Store
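  // Gather the four 4-byte rows two per register (d[0]: rows 0 and 1,
  // d[1]: rows 2 and 3), widen to 16 bits, add the DC offset, then pack
  // back to bytes with unsigned saturation.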
  d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
  d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
  d[0] = _mm_unpacklo_epi32(d[0],
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
  d[1] = _mm_unpacklo_epi32(
      _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
  d[0] = _mm_unpacklo_epi8(d[0], zero);
  d[1] = _mm_unpacklo_epi8(d[1], zero);
  d[0] = _mm_add_epi16(d[0], dc_value);
  d[1] = _mm_add_epi16(d[1], dc_value);
  d[0] = _mm_packus_epi16(d[0], d[1]);

  *(int *)dest = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
}

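// 1-D 4-point IDCT, two rows per register. pair_set_epi16(c0, c1)
// interleaves two 16-bit constants so that each _mm_madd_epi16() computes
// a * c0 + b * c1 per 32-bit lane, i.e. one rotation butterfly per madd.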
void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_16bit_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[3], v[2]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}

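// 4-point ADST, mirroring the C reference iadst4_c(). After the transpose,
// in[0] holds transform inputs x0 and x1 and in[1] holds x2 and x3 (two
// rows per register), so the in7 computation below leaves
// s7 = x0 - x2 + x3 in its low four lanes.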
void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_16bit_4x4(in);
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}

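// Single-pair counterpart of the MULTIPLICATION_AND_ADD() helper used by
// IDCT8 and IDCT16 below: both madd products are rounded, shifted down by
// DCT_CONST_BITS and packed back to 16 bits. tmp0-tmp3 and `rounding` must
// be visible at the expansion site.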
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  {                                                                  \
    tmp0 = _mm_madd_epi16(lo_0, cst0);                               \
    tmp1 = _mm_madd_epi16(hi_0, cst0);                               \
    tmp2 = _mm_madd_epi16(lo_0, cst1);                               \
    tmp3 = _mm_madd_epi16(hi_0, cst1);                               \
                                                                     \
    tmp0 = _mm_add_epi32(tmp0, rounding);                            \
    tmp1 = _mm_add_epi32(tmp1, rounding);                            \
    tmp2 = _mm_add_epi32(tmp2, rounding);                            \
    tmp3 = _mm_add_epi32(tmp3, rounding);                            \
                                                                     \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                     \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                     \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                     \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                     \
                                                                     \
    res0 = _mm_packs_epi32(tmp0, tmp1);                              \
    res1 = _mm_packs_epi32(tmp2, tmp3);                              \
  }

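// 4-stage 8-point IDCT. Stage 1 rotates the odd inputs (1,7) and (3,5);
// stage 2 rotates the even inputs (0,4) and (2,6) and butterflies the odd
// half; stage 3 butterflies the even half and applies the 5/6 rotation;
// stage 4 is the final butterfly into out0-out7.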
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
              out4, out5, out6, out7)                                         \
  {                                                                           \
    /* Stage1 */                                                              \
    {                                                                         \
      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);                     \
      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);                     \
      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);                     \
      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);                     \
                                                                              \
      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1,      \
                             stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6)  \
    }                                                                         \
                                                                              \
    /* Stage2 */                                                              \
    {                                                                         \
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4);                     \
      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4);                     \
      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);                     \
      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);                     \
                                                                              \
      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1,      \
                             stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3)  \
                                                                              \
      stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                 \
      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                 \
      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                 \
      stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                 \
    }                                                                         \
                                                                              \
    /* Stage3 */                                                              \
    {                                                                         \
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \
      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \
                                                                              \
      stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                 \
      stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                 \
      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                 \
      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                 \
                                                                              \
      tmp0 = _mm_madd_epi16(lo_56, stg2_1);                                   \
      tmp1 = _mm_madd_epi16(hi_56, stg2_1);                                   \
      tmp2 = _mm_madd_epi16(lo_56, stg2_0);                                   \
      tmp3 = _mm_madd_epi16(hi_56, stg2_0);                                   \
                                                                              \
      tmp0 = _mm_add_epi32(tmp0, rounding);                                   \
      tmp1 = _mm_add_epi32(tmp1, rounding);                                   \
      tmp2 = _mm_add_epi32(tmp2, rounding);                                   \
      tmp3 = _mm_add_epi32(tmp3, rounding);                                   \
                                                                              \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                            \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                            \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                            \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                            \
                                                                              \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                   \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                   \
    }                                                                         \
                                                                              \
    /* Stage4  */                                                             \
    out0 = _mm_add_epi16(stp1_0, stp2_7);                                     \
    out1 = _mm_add_epi16(stp1_1, stp1_6);                                     \
    out2 = _mm_add_epi16(stp1_2, stp1_5);                                     \
    out3 = _mm_add_epi16(stp1_3, stp2_4);                                     \
    out4 = _mm_sub_epi16(stp1_3, stp2_4);                                     \
    out5 = _mm_sub_epi16(stp1_2, stp1_5);                                     \
    out6 = _mm_sub_epi16(stp1_1, stp1_6);                                     \
    out7 = _mm_sub_epi16(stp1_0, stp2_7);                                     \
  }

void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);
  in4 = load_input_data(input + 8 * 4);
  in5 = load_input_data(input + 8 * 5);
  in6 = load_input_data(input + 8 * 6);
  in7 = load_input_data(input + 8 * 7);

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
          in6, in7);
  }

  // Final rounding and shift: (x + 16) >> 5 implements
  // ROUND_POWER_OF_TWO(x, 5); the saturating adds avoid 16-bit wraparound.
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}

void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest + 0 * stride, dc_value);
  RECON_AND_STORE(dest + 1 * stride, dc_value);
  RECON_AND_STORE(dest + 2 * stride, dc_value);
  RECON_AND_STORE(dest + 3 * stride, dc_value);
  RECON_AND_STORE(dest + 4 * stride, dc_value);
  RECON_AND_STORE(dest + 5 * stride, dc_value);
  RECON_AND_STORE(dest + 6 * stride, dc_value);
  RECON_AND_STORE(dest + 7 * stride, dc_value);
}

void idct8_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
                in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
        in[4], in[5], in[6], in[7]);
}

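// 8-point ADST, mirroring the C reference iadst8_c(). The inputs are first
// reordered into (x0..x7) butterfly order; three stages of madd rotations
// follow, and the final assignment negates s4, s2, s7 and s1 to form
// outputs 1, 3, 5 and 7.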
void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // reorder the inputs for the butterfly stages
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integers
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // rounding and shift
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // narrow back to 16 bits, packing eight values into each __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

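// 8x8 IDCT for blocks whose nonzero coefficients all lie in the top-left
// 4x4 corner (at most 12 nonzero coefficients, hence the name). The first
// pass works on the four loaded rows only; the second pass feeds zeros for
// inputs 4-7.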
void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);

  // 8x4 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
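  // After the 8x4 transpose, in0 holds transform inputs 0 and 1 and in1
  // holds inputs 2 and 3 (four coefficients each); all other inputs are
  // known to be zero.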
  // Stage1
  {
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
  }

  // Stage2
  {
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
    stp2_2 = _mm_packs_epi32(tmp6, tmp4);

    tmp0 = _mm_add_epi16(stp1_4, stp1_5);
    tmp1 = _mm_sub_epi16(stp1_4, stp1_5);

    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  {
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp4 = _mm_add_epi16(stp2_0, stp2_2);
    tmp6 = _mm_sub_epi16(stp2_0, stp2_2);

    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  tmp0 = _mm_add_epi16(stp1_3, stp2_4);
  tmp1 = _mm_add_epi16(stp1_2, stp1_5);
  tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
  tmp3 = _mm_sub_epi16(stp1_2, stp1_5);

  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
        in5, in6, in7);
  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}

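// Stages 2-6 of the 16-point IDCT (stage 1 is a pass-through; stage 7, the
// final butterfly, is left to the caller). The in[16], stg*, rounding and
// stp1_*/stp2_*/tmp* names must all be declared at the expansion site.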
#define IDCT16                                                                 \
  /* Stage2 */                                                                 \
  {                                                                            \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]);                 \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]);                 \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);                   \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);                   \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]);                 \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]);                 \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]);                 \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]);                 \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1,   \
                           stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14)   \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
                           stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
  }                                                                            \
                                                                               \
  /* Stage3 */                                                                 \
  {                                                                            \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]);                 \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]);                 \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]);                 \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]);                 \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
                           stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6)     \
                                                                               \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);                                  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
                                                                               \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);                               \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
  }                                                                            \
                                                                               \
  /* Stage4 */                                                                 \
  {                                                                            \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);                   \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]);                   \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]);                 \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]);                 \
                                                                               \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1,   \
                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
                                                                               \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
                           stp2_13)                                            \
  }                                                                            \
                                                                               \
  /* Stage5 */                                                                 \
  {                                                                            \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
                                                                               \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
                                                                               \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
                                                                               \
    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
                                                                               \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
                                                                               \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
                                                                               \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
                                                                               \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
  }                                                                            \
                                                                               \
  /* Stage6 */                                                                 \
  {                                                                            \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
                                                                               \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
                           stp2_12)                                            \
  }

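// Shortcut variant of IDCT16 for blocks whose nonzero coefficients fit in
// the top-left 4x4 corner: with most inputs zero, several rotations
// collapse into plain copies (e.g. stp1_9 = stp1_8_0 in Stage3).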
#define IDCT16_10                                                              \
  /* Stage2 */                                                                 \
  {                                                                            \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero);                   \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero);                   \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
                           stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11,         \
                           stp1_12_0)                                          \
  }                                                                            \
                                                                               \
  /* Stage3 */                                                                 \
  {                                                                            \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero);                   \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero);                   \
                                                                               \
    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
                                                                               \
    stp1_9 = stp1_8_0;                                                         \
    stp1_10 = stp1_11;                                                         \
                                                                               \
    stp1_13 = stp1_12_0;                                                       \
    stp1_14 = stp1_15;                                                         \
  }                                                                            \
                                                                               \
  /* Stage4 */                                                                 \
  {                                                                            \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);                    \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero);                    \
                                                                               \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
                                                                               \
    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1)   \
    stp2_5 = stp2_4;                                                           \
    stp2_6 = stp2_7;                                                           \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
                           stp2_13)                                            \
  }                                                                            \
                                                                               \
  /* Stage5 */                                                                 \
  {                                                                            \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
                                                                               \
    stp1_2 = stp1_1;                                                           \
    stp1_3 = stp1_0;                                                           \
                                                                               \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
                                                                               \
    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
                                                                               \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
                                                                               \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
                                                                               \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
                                                                               \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
  }                                                                            \
                                                                               \
  /* Stage6 */                                                                 \
  {                                                                            \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
                                                                               \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
                           stp2_12)                                            \
  }

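// Full 16x16 IDCT. Each loop iteration transforms eight rows: in[0..7]
// load the left eight coefficients of those rows and in[8..15] the right
// eight, and the two 8x8 transposes line up in[k] with coefficient k so
// that IDCT16 processes all eight rows at once.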
void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[16], l[16], r[16], *curr1;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  curr1 = l;
  for (i = 0; i < 2; i++) {
    // First pass: 1-D idct of the rows, 8 at a time.

    // Load input data.
    in[0] = load_input_data(input);
    in[8] = load_input_data(input + 8 * 1);
    in[1] = load_input_data(input + 8 * 2);
    in[9] = load_input_data(input + 8 * 3);
    in[2] = load_input_data(input + 8 * 4);
    in[10] = load_input_data(input + 8 * 5);
    in[3] = load_input_data(input + 8 * 6);
    in[11] = load_input_data(input + 8 * 7);
    in[4] = load_input_data(input + 8 * 8);
    in[12] = load_input_data(input + 8 * 9);
    in[5] = load_input_data(input + 8 * 10);
    in[13] = load_input_data(input + 8 * 11);
    in[6] = load_input_data(input + 8 * 12);
    in[14] = load_input_data(input + 8 * 13);
    in[7] = load_input_data(input + 8 * 14);
    in[15] = load_input_data(input + 8 * 15);

    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);

    IDCT16

    // Stage7
    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);

    curr1 = r;
    input += 128;
  }
  for (i = 0; i < 2; i++) {
    int j;
    // Second pass: 1-D idct of the columns.
    array_transpose_8x8(l + i * 8, in);
    array_transpose_8x8(r + i * 8, in + 8);

    IDCT16

    // Stage7: final 2-D output.
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

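    // The inverse transform is defined so the 2-D result carries a scale
    // factor of 2^6, hence the add of 1 << 5 and arithmetic shift by 6
    // below; per pixel this is roughly
    //   out = clip8(dest[x] + ROUND_POWER_OF_TWO(residual, 6));
    // (a scalar sketch; RECON_AND_STORE does the clamped add and store).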
    for (j = 0; j < 16; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

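// DC-only shortcut: when just input[0] is nonzero, every pixel of the 2-D
// IDCT output is the same constant, so it is computed once in scalar code.
// The two dct_const_round_shift() calls mirror the cospi_16_64 scaling the
// DC term picks up in each 1-D pass, and ROUND_POWER_OF_TWO(a, 6) matches
// the final >> 6 on the full-transform path above.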
void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 16; ++i) {
    RECON_AND_STORE(dest + 0, dc_value);
    RECON_AND_STORE(dest + 8, dc_value);
    dest += stride;
  }
}

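// 16-point inverse ADST applied to 8 columns at once. Every stage uses the
// same SSE2 butterfly idiom: interleave two 16-bit vectors with
// unpacklo/unpackhi, multiply-accumulate against pair_set_epi16(a, b)
// constants with _mm_madd_epi16 (giving a * x + b * y per 32-bit lane),
// then round and shift by DCT_CONST_BITS and pack back to 16 bits. Scalar
// sketch of one such rotation:
//   s = ROUND_POWER_OF_TWO(a * x + b * y, DCT_CONST_BITS);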
static void iadst16_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

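  // Final output mapping: write the stage results back in ADST output order;
  // _mm_sub_epi16(kZero, x) supplies the sign flips on outputs 1, 3, 13 and
  // 15.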
  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

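// 16-point inverse DCT on 8 columns, written out as the usual 7-stage
// butterfly network: stage 1 is only the even/odd input reordering, the
// middle stages alternate cospi rotations (_mm_madd_epi16 on interleaved
// pairs) with 16-bit add/sub butterflies, and stage 7 mirrors the two
// halves back together.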
static void idct16_8col(__m128i *in) {
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i v[16], u[16], s[16], t[16];

  // stage 1
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[8] = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9] = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[9] = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);
  s[11] = t[11];
  s[12] = t[12];

  // stage 5
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  t[5] = _mm_packs_epi32(u[0], u[1]);
  t[6] = _mm_packs_epi32(u[2], u[3]);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];

  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}

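// 1-D transform entry points on a full 16x16 block held as two 8x16 halves
// (in0 = left, in1 = right): transpose the whole block, then run the
// 8-column transform on each half. These are presumably the helpers the
// hybrid ADST/DCT (iht) paths call into.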
void idct16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_8col(in0);
  idct16_8col(in1);
}

void iadst16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_8col(in0);
  iadst16_8col(in1);
}

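// Sparse 16x16 path (eob <= 10): the nonzero coefficients all lie in the
// top-left 4x4 corner, so the first pass loads only four partial rows and
// the zero-extended unpacks below fold the known zeros into each stage.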
void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i in[16], l[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
      stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
      stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // First 1-D inverse DCT
  // Load input data.
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 8 * 2);
  in[2] = load_input_data(input + 8 * 4);
  in[3] = load_input_data(input + 8 * 6);

  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);

  // Stage2
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);

    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_8 = _mm_packs_epi32(tmp0, tmp2);
    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
  }

  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);

    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);

    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
    stp2_10 = _mm_packs_epi32(tmp5, tmp7);

    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
  }

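  // From here on only 4 of the 8 lanes of several vectors are meaningful, so
  // two 4-wide halves are kept packed in one register and split or merged
  // with _mm_unpacklo_epi64 / _mm_unpackhi_epi64 (one 64-bit half = four
  // int16 values).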
  // Stage5 and Stage6
  {
    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);

    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);

    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
  }
   2066 
   2067   // Stage6
   2068   {
   2069     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
   2070     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
   2071     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
   2072 
   2073     tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
   2074     tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
   2075     tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
   2076     tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
   2077     tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
   2078     tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
   2079 
   2080     tmp1 = _mm_add_epi32(tmp1, rounding);
   2081     tmp3 = _mm_add_epi32(tmp3, rounding);
   2082     tmp0 = _mm_add_epi32(tmp0, rounding);
   2083     tmp2 = _mm_add_epi32(tmp2, rounding);
   2084     tmp4 = _mm_add_epi32(tmp4, rounding);
   2085     tmp6 = _mm_add_epi32(tmp6, rounding);
   2086 
   2087     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
   2088     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
   2089     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
   2090     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
   2091     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
   2092     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
   2093 
   2094     stp1_6 = _mm_packs_epi32(tmp3, tmp1);
   2095 
   2096     stp2_10 = _mm_packs_epi32(tmp0, zero);
   2097     stp2_13 = _mm_packs_epi32(tmp2, zero);
   2098     stp2_11 = _mm_packs_epi32(tmp4, zero);
   2099     stp2_12 = _mm_packs_epi32(tmp6, zero);
   2100 
   2101     tmp0 = _mm_add_epi16(stp1_0, stp1_4);
   2102     tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
   2103     tmp2 = _mm_add_epi16(stp1_1, stp1_6);
   2104     tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
   2105 
   2106     stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
   2107     stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
   2108     stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
   2109     stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
   2110     stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
   2111     stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
   2112     stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
   2113     stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
   2114   }
   2115 
   2116   // Stage7. Left 8x16 only.
   2117   l[0] = _mm_add_epi16(stp2_0, stp1_15);
   2118   l[1] = _mm_add_epi16(stp2_1, stp1_14);
   2119   l[2] = _mm_add_epi16(stp2_2, stp2_13);
   2120   l[3] = _mm_add_epi16(stp2_3, stp2_12);
   2121   l[4] = _mm_add_epi16(stp2_4, stp2_11);
   2122   l[5] = _mm_add_epi16(stp2_5, stp2_10);
   2123   l[6] = _mm_add_epi16(stp2_6, stp1_9);
   2124   l[7] = _mm_add_epi16(stp2_7, stp1_8);
   2125   l[8] = _mm_sub_epi16(stp2_7, stp1_8);
   2126   l[9] = _mm_sub_epi16(stp2_6, stp1_9);
   2127   l[10] = _mm_sub_epi16(stp2_5, stp2_10);
   2128   l[11] = _mm_sub_epi16(stp2_4, stp2_11);
   2129   l[12] = _mm_sub_epi16(stp2_3, stp2_12);
   2130   l[13] = _mm_sub_epi16(stp2_2, stp2_13);
   2131   l[14] = _mm_sub_epi16(stp2_1, stp1_14);
   2132   l[15] = _mm_sub_epi16(stp2_0, stp1_15);
   2133 
   2134   // Second 1-D inverse transform, performed per 8x16 block
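  // Each iteration transposes half of the intermediate rows, applies the
  // reduced column transform, and reconstructs an 8-pixel-wide strip.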
   2135   for (i = 0; i < 2; i++) {
   2136     int j;
   2137     array_transpose_4X8(l + 8 * i, in);
   2138 
   2139     IDCT16_10
   2140 
   2141     // Stage7
   2142     in[0] = _mm_add_epi16(stp2_0, stp1_15);
   2143     in[1] = _mm_add_epi16(stp2_1, stp1_14);
   2144     in[2] = _mm_add_epi16(stp2_2, stp2_13);
   2145     in[3] = _mm_add_epi16(stp2_3, stp2_12);
   2146     in[4] = _mm_add_epi16(stp2_4, stp2_11);
   2147     in[5] = _mm_add_epi16(stp2_5, stp2_10);
   2148     in[6] = _mm_add_epi16(stp2_6, stp1_9);
   2149     in[7] = _mm_add_epi16(stp2_7, stp1_8);
   2150     in[8] = _mm_sub_epi16(stp2_7, stp1_8);
   2151     in[9] = _mm_sub_epi16(stp2_6, stp1_9);
   2152     in[10] = _mm_sub_epi16(stp2_5, stp2_10);
   2153     in[11] = _mm_sub_epi16(stp2_4, stp2_11);
   2154     in[12] = _mm_sub_epi16(stp2_3, stp2_12);
   2155     in[13] = _mm_sub_epi16(stp2_2, stp2_13);
   2156     in[14] = _mm_sub_epi16(stp2_1, stp1_14);
   2157     in[15] = _mm_sub_epi16(stp2_0, stp1_15);
   2158 
   2159     for (j = 0; j < 16; ++j) {
   2160       // Final rounding and shift
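      // Equivalent to ROUND_POWER_OF_TWO(x, 6): add 32 (final_rounding),
      // then arithmetic shift right by 6. The saturating add avoids 16-bit
      // wraparound.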
   2161       in[j] = _mm_adds_epi16(in[j], final_rounding);
   2162       in[j] = _mm_srai_epi16(in[j], 6);
   2163       RECON_AND_STORE(dest + j * stride, in[j]);
   2164     }
   2165 
   2166     dest += 8;
   2167   }
   2168 }
   2169 
   2170 #define LOAD_DQCOEFF(reg, input)  \
   2171   {                               \
   2172     reg = load_input_data(input); \
   2173     input += 8;                   \
   2174   }
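// LOAD_DQCOEFF reads 8 contiguous tran_low_t coefficients into one __m128i
// and advances the source pointer past them. A minimal usage sketch
// (hypothetical caller loading a full 32x8 strip of coefficients):
//   for (i = 0; i < 32; ++i) LOAD_DQCOEFF(in[i], input);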
   2175 
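// IDCT32_34 is the 32-point butterfly specialized for the case where only
// in[0]..in[7] can be non-zero (eob <= 34, i.e. the upper-left 8x8 of
// coefficients). Products against the zeroed rows are dropped, so several
// stage outputs collapse to plain copies of earlier values.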
   2176 #define IDCT32_34                                                              \
   2177   /* Stage1 */                                                                 \
   2178   {                                                                            \
   2179     const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero);                   \
   2180     const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero);                   \
   2181                                                                                \
   2182     const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]);                   \
   2183     const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]);                   \
   2184                                                                                \
   2185     const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero);                   \
   2186     const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero);                   \
   2187                                                                                \
   2188     const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
   2189     const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
   2190                                                                                \
   2191     MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16,        \
   2192                              stp1_31);                                         \
   2193     MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19,        \
   2194                              stp1_28);                                         \
   2195     MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20,        \
   2196                              stp1_27);                                         \
   2197     MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23,      \
   2198                              stp1_24);                                         \
   2199   }                                                                            \
   2200                                                                                \
   2201   /* Stage2 */                                                                 \
   2202   {                                                                            \
   2203     const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero);                   \
   2204     const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero);                   \
   2205                                                                                \
   2206     const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]);                   \
   2207     const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]);                   \
   2208                                                                                \
   2209     MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8,         \
   2210                              stp2_15);                                         \
   2211     MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11,        \
   2212                              stp2_12);                                         \
   2213                                                                                \
   2214     stp2_16 = stp1_16;                                                         \
   2215     stp2_19 = stp1_19;                                                         \
   2216                                                                                \
   2217     stp2_20 = stp1_20;                                                         \
   2218     stp2_23 = stp1_23;                                                         \
   2219                                                                                \
   2220     stp2_24 = stp1_24;                                                         \
   2221     stp2_27 = stp1_27;                                                         \
   2222                                                                                \
   2223     stp2_28 = stp1_28;                                                         \
   2224     stp2_31 = stp1_31;                                                         \
   2225   }                                                                            \
   2226                                                                                \
   2227   /* Stage3 */                                                                 \
   2228   {                                                                            \
   2229     const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero);                   \
   2230     const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero);                   \
   2231                                                                                \
   2232     const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31);             \
   2233     const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31);             \
   2234     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28);             \
   2235     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28);             \
   2236                                                                                \
   2237     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27);             \
   2238     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27);             \
   2239     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24);             \
    2240     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24);             \
   2241                                                                                \
   2242     MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4,         \
   2243                              stp1_7);                                          \
   2244                                                                                \
   2245     stp1_8 = stp2_8;                                                           \
   2246     stp1_11 = stp2_11;                                                         \
   2247     stp1_12 = stp2_12;                                                         \
   2248     stp1_15 = stp2_15;                                                         \
   2249                                                                                \
   2250     MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
   2251                            stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
   2252                            stp1_29)                                            \
   2253     MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
   2254                            stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
   2255                            stp1_25)                                            \
   2256                                                                                \
   2257     stp1_16 = stp2_16;                                                         \
   2258     stp1_31 = stp2_31;                                                         \
   2259     stp1_19 = stp2_19;                                                         \
   2260     stp1_20 = stp2_20;                                                         \
   2261     stp1_23 = stp2_23;                                                         \
   2262     stp1_24 = stp2_24;                                                         \
   2263     stp1_27 = stp2_27;                                                         \
   2264     stp1_28 = stp2_28;                                                         \
   2265   }                                                                            \
   2266                                                                                \
   2267   /* Stage4 */                                                                 \
   2268   {                                                                            \
   2269     const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero);                   \
   2270     const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero);                   \
   2271                                                                                \
   2272     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15);               \
   2273     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15);               \
   2274     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12);             \
   2275     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12);             \
   2276                                                                                \
   2277     MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0,         \
   2278                              stp2_1);                                          \
   2279                                                                                \
   2280     stp2_4 = stp1_4;                                                           \
   2281     stp2_5 = stp1_4;                                                           \
   2282     stp2_6 = stp1_7;                                                           \
   2283     stp2_7 = stp1_7;                                                           \
   2284                                                                                \
   2285     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
   2286                            stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
   2287                            stp2_13)                                            \
   2288                                                                                \
   2289     stp2_8 = stp1_8;                                                           \
   2290     stp2_15 = stp1_15;                                                         \
   2291     stp2_11 = stp1_11;                                                         \
   2292     stp2_12 = stp1_12;                                                         \
   2293                                                                                \
   2294     stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
   2295     stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
   2296     stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
   2297     stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
   2298     stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
   2299     stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
   2300     stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
   2301     stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
   2302                                                                                \
   2303     stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
   2304     stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
   2305     stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
   2306     stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
   2307     stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
   2308     stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
   2309     stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
   2310     stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
   2311   }                                                                            \
   2312                                                                                \
   2313   /* Stage5 */                                                                 \
   2314   {                                                                            \
   2315     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
   2316     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
   2317     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
   2318     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
   2319                                                                                \
   2320     const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
   2321     const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
   2322     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
   2323     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
   2324                                                                                \
   2325     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
   2326     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
   2327                                                                                \
   2328     stp1_0 = stp2_0;                                                           \
   2329     stp1_1 = stp2_1;                                                           \
   2330     stp1_2 = stp2_1;                                                           \
   2331     stp1_3 = stp2_0;                                                           \
   2332                                                                                \
   2333     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
   2334     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
   2335     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
   2336     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
   2337                                                                                \
   2338     tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
   2339     tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
   2340     tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
   2341     tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
   2342                                                                                \
   2343     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
   2344     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
   2345     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
   2346     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
   2347                                                                                \
   2348     stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
   2349     stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
   2350                                                                                \
   2351     stp1_4 = stp2_4;                                                           \
   2352     stp1_7 = stp2_7;                                                           \
   2353                                                                                \
   2354     stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
   2355     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
   2356     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
   2357     stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
   2358     stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
   2359     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
   2360     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
   2361     stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
   2362                                                                                \
   2363     stp1_16 = stp2_16;                                                         \
   2364     stp1_17 = stp2_17;                                                         \
   2365                                                                                \
   2366     MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
   2367                            stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
   2368                            stp1_28)                                            \
   2369     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
   2370                            stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
   2371                            stp1_26)                                            \
   2372                                                                                \
   2373     stp1_22 = stp2_22;                                                         \
   2374     stp1_23 = stp2_23;                                                         \
   2375     stp1_24 = stp2_24;                                                         \
   2376     stp1_25 = stp2_25;                                                         \
   2377     stp1_30 = stp2_30;                                                         \
   2378     stp1_31 = stp2_31;                                                         \
   2379   }                                                                            \
   2380                                                                                \
   2381   /* Stage6 */                                                                 \
   2382   {                                                                            \
   2383     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
   2384     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
   2385     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
   2386     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
   2387                                                                                \
   2388     stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
   2389     stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
   2390     stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
   2391     stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
   2392     stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
   2393     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
   2394     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
   2395     stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
   2396                                                                                \
   2397     stp2_8 = stp1_8;                                                           \
   2398     stp2_9 = stp1_9;                                                           \
   2399     stp2_14 = stp1_14;                                                         \
   2400     stp2_15 = stp1_15;                                                         \
   2401                                                                                \
   2402     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
   2403                            stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
   2404                            stp2_12)                                            \
   2405                                                                                \
   2406     stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
   2407     stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
   2408     stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
   2409     stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
   2410     stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
   2411     stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
   2412     stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
   2413     stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
   2414                                                                                \
   2415     stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
   2416     stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
   2417     stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
   2418     stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
   2419     stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
   2420     stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
   2421     stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
   2422     stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
   2423   }                                                                            \
   2424                                                                                \
   2425   /* Stage7 */                                                                 \
   2426   {                                                                            \
   2427     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
   2428     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
   2429     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
   2430     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
   2431                                                                                \
   2432     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
   2433     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
   2434     const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
   2435     const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
   2436                                                                                \
   2437     stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
   2438     stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
   2439     stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
   2440     stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
   2441     stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
   2442     stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
   2443     stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
   2444     stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
   2445     stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
   2446     stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
   2447     stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
   2448     stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
   2449     stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
   2450     stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
   2451     stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
   2452     stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
   2453                                                                                \
   2454     stp1_16 = stp2_16;                                                         \
   2455     stp1_17 = stp2_17;                                                         \
   2456     stp1_18 = stp2_18;                                                         \
   2457     stp1_19 = stp2_19;                                                         \
   2458                                                                                \
   2459     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
   2460                            stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
   2461                            stp1_26)                                            \
   2462     MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
   2463                            stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
   2464                            stp1_24)                                            \
   2465                                                                                \
   2466     stp1_28 = stp2_28;                                                         \
   2467     stp1_29 = stp2_29;                                                         \
   2468     stp1_30 = stp2_30;                                                         \
   2469     stp1_31 = stp2_31;                                                         \
   2470   }
   2471 
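// IDCT32 is the full 32-point butterfly: all 32 input rows participate and
// every stage performs the complete set of rotations and add/sub pairs.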
   2472 #define IDCT32                                                                 \
   2473   /* Stage1 */                                                                 \
   2474   {                                                                            \
   2475     const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]);                 \
   2476     const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]);                 \
   2477     const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]);               \
   2478     const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]);               \
   2479                                                                                \
   2480     const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]);                 \
   2481     const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]);                 \
   2482     const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]);                 \
   2483     const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]);                 \
   2484                                                                                \
   2485     const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]);                 \
   2486     const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]);                 \
   2487     const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]);               \
   2488     const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]);               \
   2489                                                                                \
   2490     const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]);               \
   2491     const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]);               \
   2492     const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]);                 \
   2493     const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]);                 \
   2494                                                                                \
   2495     MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,       \
   2496                            stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17,  \
   2497                            stp1_30)                                            \
   2498     MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
   2499                            stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
   2500     MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,       \
   2501                            stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,         \
   2502                            stp1_21, stp1_26)                                   \
   2503     MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,      \
   2504                            stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,        \
   2505                            stp1_23, stp1_24)                                   \
   2506   }                                                                            \
   2507                                                                                \
   2508   /* Stage2 */                                                                 \
   2509   {                                                                            \
   2510     const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]);                 \
   2511     const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]);                 \
   2512     const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]);               \
   2513     const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]);               \
   2514                                                                                \
   2515     const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]);               \
   2516     const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]);               \
   2517     const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]);                 \
   2518     const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]);                 \
   2519                                                                                \
   2520     MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,       \
   2521                            stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,    \
   2522                            stp2_14)                                            \
   2523     MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,       \
   2524                            stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11,  \
   2525                            stp2_12)                                            \
   2526                                                                                \
   2527     stp2_16 = _mm_add_epi16(stp1_16, stp1_17);                                 \
   2528     stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);                                 \
   2529     stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);                                 \
   2530     stp2_19 = _mm_add_epi16(stp1_19, stp1_18);                                 \
   2531                                                                                \
   2532     stp2_20 = _mm_add_epi16(stp1_20, stp1_21);                                 \
   2533     stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);                                 \
   2534     stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);                                 \
   2535     stp2_23 = _mm_add_epi16(stp1_23, stp1_22);                                 \
   2536                                                                                \
   2537     stp2_24 = _mm_add_epi16(stp1_24, stp1_25);                                 \
   2538     stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);                                 \
   2539     stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);                                 \
   2540     stp2_27 = _mm_add_epi16(stp1_27, stp1_26);                                 \
   2541                                                                                \
   2542     stp2_28 = _mm_add_epi16(stp1_28, stp1_29);                                 \
   2543     stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);                                 \
   2544     stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);                                 \
   2545     stp2_31 = _mm_add_epi16(stp1_31, stp1_30);                                 \
   2546   }                                                                            \
   2547                                                                                \
   2548   /* Stage3 */                                                                 \
   2549   {                                                                            \
   2550     const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]);                 \
   2551     const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]);                 \
   2552     const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]);               \
   2553     const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]);               \
   2554                                                                                \
   2555     const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);             \
   2556     const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);             \
   2557     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
   2558     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
   2559                                                                                \
   2560     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
   2561     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
   2562     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
   2563     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
   2564                                                                                \
   2565     MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,       \
   2566                            stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,     \
   2567                            stp1_6)                                             \
   2568                                                                                \
   2569     stp1_8 = _mm_add_epi16(stp2_8, stp2_9);                                    \
   2570     stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
   2571     stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
   2572     stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
   2573     stp1_12 = _mm_add_epi16(stp2_12, stp2_13);                                 \
   2574     stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
   2575     stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
   2576     stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
   2577                                                                                \
   2578     MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
   2579                            stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
   2580                            stp1_29)                                            \
   2581     MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
   2582                            stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
   2583                            stp1_25)                                            \
   2584                                                                                \
   2585     stp1_16 = stp2_16;                                                         \
   2586     stp1_31 = stp2_31;                                                         \
   2587     stp1_19 = stp2_19;                                                         \
   2588     stp1_20 = stp2_20;                                                         \
   2589     stp1_23 = stp2_23;                                                         \
   2590     stp1_24 = stp2_24;                                                         \
   2591     stp1_27 = stp2_27;                                                         \
   2592     stp1_28 = stp2_28;                                                         \
   2593   }                                                                            \
   2594                                                                                \
   2595   /* Stage4 */                                                                 \
   2596   {                                                                            \
   2597     const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]);                 \
   2598     const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]);                 \
   2599     const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]);                 \
   2600     const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]);                 \
   2601                                                                                \
   2602     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
   2603     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
   2604     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
   2605     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
   2606                                                                                \
   2607     MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
   2608                            stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
   2609                                                                                \
   2610     stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
   2611     stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
   2612     stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
   2613     stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
   2614                                                                                \
   2615     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
   2616                            stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
   2617                            stp2_13)                                            \
   2618                                                                                \
   2619     stp2_8 = stp1_8;                                                           \
   2620     stp2_15 = stp1_15;                                                         \
   2621     stp2_11 = stp1_11;                                                         \
   2622     stp2_12 = stp1_12;                                                         \
   2623                                                                                \
   2624     stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
   2625     stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
   2626     stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
   2627     stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
   2628     stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
   2629     stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
   2630     stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
   2631     stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
   2632                                                                                \
   2633     stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
   2634     stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
   2635     stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
   2636     stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
   2637     stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
   2638     stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
   2639     stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
   2640     stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
   2641   }                                                                            \
   2642                                                                                \
   2643   /* Stage5 */                                                                 \
   2644   {                                                                            \
   2645     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
   2646     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
   2647     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
   2648     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
   2649                                                                                \
   2650     const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
   2651     const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
   2652     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
   2653     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
   2654                                                                                \
   2655     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
   2656     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
   2657                                                                                \
   2658     stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
   2659     stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
   2660     stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
   2661     stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
   2662                                                                                \
   2663     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
   2664     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
   2665     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
   2666     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
   2667                                                                                \
   2668     tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
   2669     tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
   2670     tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
   2671     tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
   2672                                                                                \
   2673     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
   2674     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
   2675     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
   2676     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
   2677                                                                                \
   2678     stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
   2679     stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
   2680                                                                                \
   2681     stp1_4 = stp2_4;                                                           \
   2682     stp1_7 = stp2_7;                                                           \
   2683                                                                                \
   2684     stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
   2685     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
   2686     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
   2687     stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
   2688     stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
   2689     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
   2690     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
   2691     stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
   2692                                                                                \
   2693     stp1_16 = stp2_16;                                                         \
   2694     stp1_17 = stp2_17;                                                         \
   2695                                                                                \
   2696     MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
   2697                            stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
   2698                            stp1_28)                                            \
   2699     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
   2700                            stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
   2701                            stp1_26)                                            \
   2702                                                                                \
   2703     stp1_22 = stp2_22;                                                         \
   2704     stp1_23 = stp2_23;                                                         \
   2705     stp1_24 = stp2_24;                                                         \
   2706     stp1_25 = stp2_25;                                                         \
   2707     stp1_30 = stp2_30;                                                         \
   2708     stp1_31 = stp2_31;                                                         \
   2709   }                                                                            \
   2710                                                                                \
   2711   /* Stage6 */                                                                 \
   2712   {                                                                            \
   2713     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
   2714     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
   2715     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
   2716     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
   2717                                                                                \
   2718     stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
   2719     stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
   2720     stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
   2721     stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
   2722     stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
   2723     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
   2724     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
   2725     stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
   2726                                                                                \
   2727     stp2_8 = stp1_8;                                                           \
   2728     stp2_9 = stp1_9;                                                           \
   2729     stp2_14 = stp1_14;                                                         \
   2730     stp2_15 = stp1_15;                                                         \
   2731                                                                                \
   2732     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
   2733                            stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
   2734                            stp2_12)                                            \
   2735                                                                                \
   2736     stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
   2737     stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
   2738     stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
   2739     stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
   2740     stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
   2741     stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
   2742     stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
   2743     stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
   2744                                                                                \
   2745     stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
   2746     stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
   2747     stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
   2748     stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
   2749     stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
   2750     stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
   2751     stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
   2752     stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
   2753   }                                                                            \
                                                                               \
  /* Stage7 */                                                                 \
  {                                                                            \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
                                                                               \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
                                                                               \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
                                                                               \
    stp1_16 = stp2_16;                                                         \
    stp1_17 = stp2_17;                                                         \
    stp1_18 = stp2_18;                                                         \
    stp1_19 = stp2_19;                                                         \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
                           stp1_26)                                            \
    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
                           stp1_24)                                            \
                                                                               \
    stp1_28 = stp2_28;                                                         \
    stp1_29 = stp2_29;                                                         \
    stp1_30 = stp2_30;                                                         \
    stp1_31 = stp2_31;                                                         \
  }

// Only the upper-left 8x8 block of the input can contain non-zero
// coefficients (at most 34 of them), so each 1-D pass only needs the first
// eight input rows and the reduced IDCT32_34 stage macro.
void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[32];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data. Only the top-left 8x8 block needs to be loaded.
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 32);
  in[2] = load_input_data(input + 64);
  in[3] = load_input_data(input + 96);
  in[4] = load_input_data(input + 128);
  in[5] = load_input_data(input + 160);
  in[6] = load_input_data(input + 192);
  in[7] = load_input_data(input + 224);

  array_transpose_8x8(in, in);
  IDCT32_34
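  // IDCT32_34 runs stages 1-7 of the 32-point idct on the transposed rows,
  // leaving its stage-7 outputs in stp1_0..stp1_31; the add/sub pairs below
  // form the final butterfly stage.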

  // 1-D pass: store the 32 intermediate results for this 8x32 block.
  col[0] = _mm_add_epi16(stp1_0, stp1_31);
  col[1] = _mm_add_epi16(stp1_1, stp1_30);
  col[2] = _mm_add_epi16(stp1_2, stp1_29);
  col[3] = _mm_add_epi16(stp1_3, stp1_28);
  col[4] = _mm_add_epi16(stp1_4, stp1_27);
  col[5] = _mm_add_epi16(stp1_5, stp1_26);
  col[6] = _mm_add_epi16(stp1_6, stp1_25);
  col[7] = _mm_add_epi16(stp1_7, stp1_24);
  col[8] = _mm_add_epi16(stp1_8, stp1_23);
  col[9] = _mm_add_epi16(stp1_9, stp1_22);
  col[10] = _mm_add_epi16(stp1_10, stp1_21);
  col[11] = _mm_add_epi16(stp1_11, stp1_20);
  col[12] = _mm_add_epi16(stp1_12, stp1_19);
  col[13] = _mm_add_epi16(stp1_13, stp1_18);
  col[14] = _mm_add_epi16(stp1_14, stp1_17);
  col[15] = _mm_add_epi16(stp1_15, stp1_16);
  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
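
  // Second 1-D pass: each of the four iterations below transposes one 8x8
  // block of the intermediate results and applies the same reduced idct as
  // the column transform.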
  for (i = 0; i < 4; i++) {
    int j;
    // Transpose the next 8x8 block of intermediate results.
    array_transpose_8x8(col + i * 8, in);
    IDCT32_34

    // 2-D pass: compute the final results and store them to the destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);

    for (j = 0; j < 32; ++j) {
      // Final rounding and shift: (x + 32) >> 6 rounds to nearest.
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}
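// Full inverse transform: all 1024 coefficients may be non-zero. The input
// is processed in four 8x32 stripes; a stripe whose coefficients are all
// zero is skipped in the first pass.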
void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[128], zero_idx[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i, j, i32;

  for (i = 0; i < 4; i++) {
    i32 = (i << 5);
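    // i32 indexes the current 8x32 stripe within col[] (32 registers each).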
    // First 1-D idct.
    // Load input data: each LOAD_DQCOEFF reads 8 coefficients and advances
    // input, so the scatter below leaves in[0..7], in[8..15], in[16..23] and
    // in[24..31] each holding one 8x8 sub-block of the current 8x32 stripe.
    LOAD_DQCOEFF(in[0], input);
    LOAD_DQCOEFF(in[8], input);
    LOAD_DQCOEFF(in[16], input);
    LOAD_DQCOEFF(in[24], input);
    LOAD_DQCOEFF(in[1], input);
    LOAD_DQCOEFF(in[9], input);
    LOAD_DQCOEFF(in[17], input);
    LOAD_DQCOEFF(in[25], input);
    LOAD_DQCOEFF(in[2], input);
    LOAD_DQCOEFF(in[10], input);
    LOAD_DQCOEFF(in[18], input);
    LOAD_DQCOEFF(in[26], input);
    LOAD_DQCOEFF(in[3], input);
    LOAD_DQCOEFF(in[11], input);
    LOAD_DQCOEFF(in[19], input);
    LOAD_DQCOEFF(in[27], input);

    LOAD_DQCOEFF(in[4], input);
    LOAD_DQCOEFF(in[12], input);
    LOAD_DQCOEFF(in[20], input);
    LOAD_DQCOEFF(in[28], input);
    LOAD_DQCOEFF(in[5], input);
    LOAD_DQCOEFF(in[13], input);
    LOAD_DQCOEFF(in[21], input);
    LOAD_DQCOEFF(in[29], input);
    LOAD_DQCOEFF(in[6], input);
    LOAD_DQCOEFF(in[14], input);
    LOAD_DQCOEFF(in[22], input);
    LOAD_DQCOEFF(in[30], input);
    LOAD_DQCOEFF(in[7], input);
    LOAD_DQCOEFF(in[15], input);
    LOAD_DQCOEFF(in[23], input);
    LOAD_DQCOEFF(in[31], input);

    // Check whether all 32 rows of this stripe are zero by OR-ing them
    // together and reducing.
    zero_idx[0] = _mm_or_si128(in[0], in[1]);
    zero_idx[1] = _mm_or_si128(in[2], in[3]);
    zero_idx[2] = _mm_or_si128(in[4], in[5]);
    zero_idx[3] = _mm_or_si128(in[6], in[7]);
    zero_idx[4] = _mm_or_si128(in[8], in[9]);
    zero_idx[5] = _mm_or_si128(in[10], in[11]);
    zero_idx[6] = _mm_or_si128(in[12], in[13]);
    zero_idx[7] = _mm_or_si128(in[14], in[15]);
    zero_idx[8] = _mm_or_si128(in[16], in[17]);
    zero_idx[9] = _mm_or_si128(in[18], in[19]);
    zero_idx[10] = _mm_or_si128(in[20], in[21]);
    zero_idx[11] = _mm_or_si128(in[22], in[23]);
    zero_idx[12] = _mm_or_si128(in[24], in[25]);
    zero_idx[13] = _mm_or_si128(in[26], in[27]);
    zero_idx[14] = _mm_or_si128(in[28], in[29]);
    zero_idx[15] = _mm_or_si128(in[30], in[31]);

    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);

    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);

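    // zero_idx[14] now holds the OR of all 32 input registers; cmpeq +
    // movemask returns 0xFFFF only when every lane of that OR is zero.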
    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
      for (j = 0; j < 32; ++j) {
        col[i32 + j] = _mm_setzero_si128();
      }
      continue;
    }

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);
    array_transpose_8x8(in + 16, in + 16);
    array_transpose_8x8(in + 24, in + 24);

    IDCT32

    // 1-D pass: store the 32 intermediate results for this 8x32 block.
    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
  }
  for (i = 0; i < 4; i++) {
    // Second 1-D idct
    j = i << 3;
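    // j selects the i-th group of 8 columns within the intermediate results.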

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + j, in);
    array_transpose_8x8(col + j + 32, in + 8);
    array_transpose_8x8(col + j + 64, in + 16);
    array_transpose_8x8(col + j + 96, in + 24);

    IDCT32

    // 2-D pass: compute the final results and store them to the destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);

    for (j = 0; j < 32; ++j) {
      // Final rounding and shift: (x + 32) >> 6 rounds to nearest.
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, j;

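  // DC-only block: input[0] is scaled by cospi_16_64 once per 1-D pass, with
  // the usual rounding after each, then the final (x + 32) >> 6 rounding is
  // applied, matching the full transform's output stage.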
  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (j = 0; j < 32; ++j) {
    RECON_AND_STORE(dest + 0 + j * stride, dc_value);
    RECON_AND_STORE(dest + 8 + j * stride, dc_value);
    RECON_AND_STORE(dest + 16 + j * stride, dc_value);
    RECON_AND_STORE(dest + 24 + j * stride, dc_value);
  }
}
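
// Minimal usage sketch (hypothetical caller; names outside this file are
// assumptions): in libvpx these functions are normally reached through the
// RTCD dispatch table rather than called directly, e.g.
//
//   vpx_idct32x32_1024_add(dqcoeff, recon, recon_stride);
//
// where dqcoeff holds 1024 dequantized coefficients in row-major order and
// recon/recon_stride describe the 8-bit prediction block the residual is
// added into.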