/* vp9/common/x86: SSE2 inverse-transform intrinsics. */
      1 /*
      2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <emmintrin.h>  // SSE2
     13 #include "./vpx_config.h"
     14 #include "vpx/vpx_integer.h"
     15 #include "vp9/common/vp9_common.h"
     16 #include "vp9/common/vp9_idct.h"
     17 
     18 #define RECON_AND_STORE4X4(dest, in_x) \
     19 {                                                     \
     20   __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
     21   d0 = _mm_unpacklo_epi8(d0, zero); \
     22   d0 = _mm_add_epi16(in_x, d0); \
     23   d0 = _mm_packus_epi16(d0, d0); \
     24   *(int *)dest = _mm_cvtsi128_si32(d0); \
     25   dest += stride; \
     26 }
     27 
// Full inverse 4x4 DCT with reconstruction: adds the inverse transform of
// the 16 int16 coefficients in |input| to the 4x4 pixel block at |dest|
// (row pitch |stride| bytes), saturating each pixel to [0, 255].
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  // Cosine constants arranged to match the shuffled coefficient order built
  // below, so each _mm_madd_epi16 yields two butterfly terms per 32-bit lane.
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  // Round the 32-bit products and shift back toward 16-bit precision.
  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column2, column 3, and then, we got:
  // input2: column1, column 0;  input3: column2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns (same madd/round/pack sequence as the row pass above)
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column2, column 3, and then, we got:
  // input2: column1, column 0;  input3: column2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift (add 8, >> 4)
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
     // d0 gathers rows 0,1.  d2 gathers rows 3,2 -- note row 3 lands in the
     // LOW half; the stride * 3 / stride * 2 store order below compensates.
     d0 = _mm_unpacklo_epi32(d0,
          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
     d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
                    *(const int *) (dest + stride * 3)), d2);
     d0 = _mm_unpacklo_epi8(d0, zero);
     d2 = _mm_unpacklo_epi8(d2, zero);
     d0 = _mm_add_epi16(d0, input2);
     d2 = _mm_add_epi16(d2, input3);
     d0 = _mm_packus_epi16(d0, d2);
     // store row 0
     *(int *)dest = _mm_cvtsi128_si32(d0);
     // store row 1
     d0 = _mm_srli_si128(d0, 4);
     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
     // store row 3 (low half of the d2 gather above)
     d0 = _mm_srli_si128(d0, 4);
     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
     // store row 2
     d0 = _mm_srli_si128(d0, 4);
     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}
    157 
    158 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
    159   __m128i dc_value;
    160   const __m128i zero = _mm_setzero_si128();
    161   int a;
    162 
    163   a = dct_const_round_shift(input[0] * cospi_16_64);
    164   a = dct_const_round_shift(a * cospi_16_64);
    165   a = ROUND_POWER_OF_TWO(a, 4);
    166 
    167   dc_value = _mm_set1_epi16(a);
    168 
    169   RECON_AND_STORE4X4(dest, dc_value);
    170   RECON_AND_STORE4X4(dest, dc_value);
    171   RECON_AND_STORE4X4(dest, dc_value);
    172   RECON_AND_STORE4X4(dest, dc_value);
    173 }
    174 
    175 static INLINE void transpose_4x4(__m128i *res) {
    176   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
    177   const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
    178 
    179   res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
    180   res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
    181 }
    182 
// 4-point 1-D IDCT applied in place to a 4x4 block packed into two
// registers (in[0] = first eight int16 values, in[1] = second eight).
// The block is transposed first, so the transform runs along the other
// dimension; callers invoke this once per dimension.
static void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1: interleave then butterfly-multiply (two terms per 32-bit lane)
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  // round and shift the 32-bit products back toward 16-bit precision
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  // pack to int16; v[3]/v[2] are packed in reversed order so the add/sub
  // below pairs each element with its mirror
  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[3], v[2]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);  // swap 64-bit halves of in[1]
}
    218 
// 4-point inverse ADST applied in place, using the same two-register 4x4
// packing as idct4_sse2.  Built on the sinpi_x_9 constant network; the
// comments on the madd lines label the partial sums.
static void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  // in7 = (in[1] shifted down 8 bytes) + in[0] - in[1], per 16-bit lane
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);  // u[3] + v[5] - 4*v[5] == u[3] - 3*v[5]

  // round/shift the four 32-bit outputs and pack back to int16
  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}
    267 
// 4x4 inverse hybrid transform: picks DCT or ADST for each dimension from
// |tx_type| (see the switch below), then adds the result to the 4x4 pixel
// block at |dest| with saturation, like vp9_idct4x4_16_add_sse2.
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[2];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  // Unaligned loads here, unlike the aligned loads in the pure-DCT path.
  in[0]= _mm_loadu_si128((const __m128i *)(input));
  in[1]= _mm_loadu_si128((const __m128i *)(input + 8));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_sse2(in);
      idct4_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_sse2(in);
      iadst4_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_sse2(in);
      idct4_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_sse2(in);
      iadst4_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift (add 8, >> 4)
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  // Reconstruction and Store: gather rows 0,1 into d0 and rows 2,3 into d2
  // (natural order here, unlike the pure-DCT path), add, saturate, scatter.
  {
     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
     d0 = _mm_unpacklo_epi32(d0,
          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
     d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
                    *(const int *) (dest + stride * 3)));
     d0 = _mm_unpacklo_epi8(d0, zero);
     d2 = _mm_unpacklo_epi8(d2, zero);
     d0 = _mm_add_epi16(d0, in[0]);
     d2 = _mm_add_epi16(d2, in[1]);
     d0 = _mm_packus_epi16(d0, d2);
     // store result[0]
     *(int *)dest = _mm_cvtsi128_si32(d0);
     // store result[1]
     d0 = _mm_srli_si128(d0, 4);
     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
     // store result[2]
     d0 = _mm_srli_si128(d0, 4);
     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
     // store result[3]
     d0 = _mm_srli_si128(d0, 4);
     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
  }
}
    332 
// Transpose an 8x8 int16 matrix held one row per register (in0..in7 ->
// out0..out7; the out arguments may name the same registers as the ins).
// Three interleave rounds: 16-bit lanes, then 32-bit pairs, then 64-bit
// halves.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
                                                        \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
                                                            \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }
    363 
// Transpose variant producing four output registers from four inputs.
// NOTE(review): the first-round operand orders are deliberately mixed
// (unpackhi(tmp0, tmp1) vs. unpacklo(tmp1, tmp0)); the "_10" suffix
// suggests a reduced-coefficient path that bakes in a reordering -- confirm
// against the callers before reusing this as a plain transpose.
#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
                         out0, out1, out2, out3) \
  {                                              \
    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  }
    382 
// Transpose the low halves of in0..in3 (a 4x4 int16 tile).
// NOTE(review): the out0/out1 parameters are accepted but never written --
// the results are returned in in0 (holding i1, i0) and in1 (holding i3, i2)
// per the inline comments.  Presumably call sites pass the same registers
// for both; confirm before renaming or "fixing" this.
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
                                                        \
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
  }
    391 
// Transpose the low halves of in0..in3 into out0/out1.  Same interleave
// pattern as TRANSPOSE_8X4, but results go to the out parameters.
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  {                                            \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  }
    399 
// Butterfly helper: multiplies two (lo, hi) 16-bit-interleaved input pairs
// by the constant vectors cst0..cst3, rounds by DCT_CONST_ROUNDING, shifts
// by DCT_CONST_BITS, and packs the 32-bit results back to int16 in
// res0..res3 (lo_0/hi_0 feed res0/res1, lo_1/hi_1 feed res2/res3).
// Relies on tmp0..tmp7 and |rounding| being declared in the enclosing scope.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  {   \
      tmp0 = _mm_madd_epi16(lo_0, cst0); \
      tmp1 = _mm_madd_epi16(hi_0, cst0); \
      tmp2 = _mm_madd_epi16(lo_0, cst1); \
      tmp3 = _mm_madd_epi16(hi_0, cst1); \
      tmp4 = _mm_madd_epi16(lo_1, cst2); \
      tmp5 = _mm_madd_epi16(hi_1, cst2); \
      tmp6 = _mm_madd_epi16(lo_1, cst3); \
      tmp7 = _mm_madd_epi16(hi_1, cst3); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      tmp4 = _mm_add_epi32(tmp4, rounding); \
      tmp5 = _mm_add_epi32(tmp5, rounding); \
      tmp6 = _mm_add_epi32(tmp6, rounding); \
      tmp7 = _mm_add_epi32(tmp7, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
      \
      res0 = _mm_packs_epi32(tmp0, tmp1); \
      res1 = _mm_packs_epi32(tmp2, tmp3); \
      res2 = _mm_packs_epi32(tmp4, tmp5); \
      res3 = _mm_packs_epi32(tmp6, tmp7); \
  }
    436 
// Half-width variant of MULTIPLICATION_AND_ADD: one (lo, hi) input pair and
// two constant vectors, producing res0/res1.  Uses tmp0..tmp3 and
// |rounding| from the enclosing scope.
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  {   \
      tmp0 = _mm_madd_epi16(lo_0, cst0); \
      tmp1 = _mm_madd_epi16(hi_0, cst0); \
      tmp2 = _mm_madd_epi16(lo_0, cst1); \
      tmp3 = _mm_madd_epi16(hi_0, cst1); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      res0 = _mm_packs_epi32(tmp0, tmp1); \
      res1 = _mm_packs_epi32(tmp2, tmp3); \
  }
    457 
// Four-stage 8-point 1-D IDCT over eight row registers in0..in7, writing
// out0..out7 (outputs may alias inputs).  Requires stg1_0..stg1_3,
// stg2_0..stg2_3, stp1_0..stp1_7, stp2_0..stp2_7, tmp0..tmp7 and |rounding|
// declared in the enclosing scope (see vp9_idct8x8_64_add_sse2).
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3, out4, out5, out6, out7)  \
  { \
  /* Stage1: odd-coefficient butterflies (rows 1,7 and 3,5) */      \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                          stg1_1, stg1_2, stg1_3, stp1_4,      \
                          stp1_7, stp1_5, stp1_6)              \
  } \
    \
  /* Stage2: even-coefficient butterflies plus odd-path add/sub */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0,     \
                           stp2_1, stp2_2, stp2_3)             \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
    \
  /* Stage3: even add/sub; recompute stp1_5/stp1_6 via cospi_16 rotation */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4: final mirrored add/sub producing the eight outputs */ \
  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
  }
    529 
    530 #define RECON_AND_STORE(dest, in_x) \
    531   {                                                     \
    532      __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
    533       d0 = _mm_unpacklo_epi8(d0, zero); \
    534       d0 = _mm_add_epi16(in_x, d0); \
    535       d0 = _mm_packus_epi16(d0, d0); \
    536       _mm_storel_epi64((__m128i *)(dest), d0); \
    537       dest += stride; \
    538   }
    539 
// Full inverse 8x8 DCT with reconstruction: adds the inverse transform of
// the 64 int16 coefficients in |input| (row-major, aligned) to the 8x8
// pixel block at |dest|, saturating to [0, 255].
void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1<<4);
  // Butterfly constants consumed by the IDCT8 macro below.
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D: each pass transposes then runs the 1-D IDCT8, so the two
  // iterations transform the two dimensions in turn.
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
                  in0, in1, in2, in3, in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
             in0, in1, in2, in3, in4, in5, in6, in7);
  }

  // Final rounding and shift (add 16, arithmetic >> 5)
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  // Add residuals to dest row by row (macro advances dest by stride).
  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}
    608 
    609 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
    610   __m128i dc_value;
    611   const __m128i zero = _mm_setzero_si128();
    612   int a;
    613 
    614   a = dct_const_round_shift(input[0] * cospi_16_64);
    615   a = dct_const_round_shift(a * cospi_16_64);
    616   a = ROUND_POWER_OF_TWO(a, 5);
    617 
    618   dc_value = _mm_set1_epi16(a);
    619 
    620   RECON_AND_STORE(dest, dc_value);
    621   RECON_AND_STORE(dest, dc_value);
    622   RECON_AND_STORE(dest, dc_value);
    623   RECON_AND_STORE(dest, dc_value);
    624   RECON_AND_STORE(dest, dc_value);
    625   RECON_AND_STORE(dest, dc_value);
    626   RECON_AND_STORE(dest, dc_value);
    627   RECON_AND_STORE(dest, dc_value);
    628 }
    629 
    630 // perform 8x8 transpose
    631 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
    632   const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
    633   const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
    634   const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
    635   const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
    636   const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
    637   const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
    638   const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
    639   const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
    640 
    641   const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
    642   const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
    643   const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
    644   const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
    645   const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
    646   const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
    647   const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
    648   const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
    649 
    650   res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
    651   res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
    652   res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
    653   res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
    654   res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
    655   res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
    656   res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
    657   res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
    658 }
    659 
    660 static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
    661   const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
    662   const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
    663   const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
    664   const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
    665 
    666   const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
    667   const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
    668   const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
    669   const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
    670 
    671   out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
    672   out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
    673   out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
    674   out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
    675 }
    676 
// One transpose plus one 1-D 8-point IDCT pass, applied in place to
// in[0..7] (one row per register).
static void idct8_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Butterfly constants consumed by the IDCT8 macro.
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                in0, in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
           in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
}
    701 
// Performs one 1-D 8-point inverse ADST over the eight rows held in 'in',
// in place.  The block is transposed first, the inputs are permuted into
// butterfly order, and the three-stage ADST network (multiply/add in
// 32 bits, round, shift, repack to 16 bits) writes results back to in[0..7],
// negating the odd outputs in the final permutation.
static void iadst8_sse2(__m128i *in) {
  // Cosine pair constants for the three butterfly stages.
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  // u*/v*/w*: 32-bit intermediates; s*: 16-bit stage results.
  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0  = in[7];
  in1  = in[0];
  in2  = in[5];
  in3  = in[2];
  in4  = in[3];
  in5  = in[4];
  in6  = in[1];
  in7  = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // Final output permutation; odd-indexed outputs are negated.
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}
    929 
    930 
    931 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
    932                             int tx_type) {
    933   __m128i in[8];
    934   const __m128i zero = _mm_setzero_si128();
    935   const __m128i final_rounding = _mm_set1_epi16(1<<4);
    936 
    937   // load input data
    938   in[0] = _mm_load_si128((const __m128i *)input);
    939   in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
    940   in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
    941   in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
    942   in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
    943   in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
    944   in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
    945   in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
    946 
    947   switch (tx_type) {
    948     case 0:  // DCT_DCT
    949       idct8_sse2(in);
    950       idct8_sse2(in);
    951       break;
    952     case 1:  // ADST_DCT
    953       idct8_sse2(in);
    954       iadst8_sse2(in);
    955       break;
    956     case 2:  // DCT_ADST
    957       iadst8_sse2(in);
    958       idct8_sse2(in);
    959       break;
    960     case 3:  // ADST_ADST
    961       iadst8_sse2(in);
    962       iadst8_sse2(in);
    963       break;
    964     default:
    965       assert(0);
    966       break;
    967   }
    968 
    969   // Final rounding and shift
    970   in[0] = _mm_adds_epi16(in[0], final_rounding);
    971   in[1] = _mm_adds_epi16(in[1], final_rounding);
    972   in[2] = _mm_adds_epi16(in[2], final_rounding);
    973   in[3] = _mm_adds_epi16(in[3], final_rounding);
    974   in[4] = _mm_adds_epi16(in[4], final_rounding);
    975   in[5] = _mm_adds_epi16(in[5], final_rounding);
    976   in[6] = _mm_adds_epi16(in[6], final_rounding);
    977   in[7] = _mm_adds_epi16(in[7], final_rounding);
    978 
    979   in[0] = _mm_srai_epi16(in[0], 5);
    980   in[1] = _mm_srai_epi16(in[1], 5);
    981   in[2] = _mm_srai_epi16(in[2], 5);
    982   in[3] = _mm_srai_epi16(in[3], 5);
    983   in[4] = _mm_srai_epi16(in[4], 5);
    984   in[5] = _mm_srai_epi16(in[5], 5);
    985   in[6] = _mm_srai_epi16(in[6], 5);
    986   in[7] = _mm_srai_epi16(in[7], 5);
    987 
    988   RECON_AND_STORE(dest, in[0]);
    989   RECON_AND_STORE(dest, in[1]);
    990   RECON_AND_STORE(dest, in[2]);
    991   RECON_AND_STORE(dest, in[3]);
    992   RECON_AND_STORE(dest, in[4]);
    993   RECON_AND_STORE(dest, in[5]);
    994   RECON_AND_STORE(dest, in[6]);
    995   RECON_AND_STORE(dest, in[7]);
    996 }
    997 
// Shortcut 8x8 inverse DCT for blocks where only the first 10 coefficients
// (the top-left 4x4 corner, rows 0-3) are non-zero.  The row pass exploits
// the known zeros with a reduced butterfly, then a full IDCT8 column pass
// reconstructs the block and adds it to 'dest' with the given row stride.
void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Final output rounding: add 16 before the >> 5 shift below.
  const __m128i final_rounding = _mm_set1_epi16(1<<4);
  // Cosine pair constants; also referenced by name inside the IDCT8 macro.
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
  // Stage1
  { //NOLINT
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    // Round and shift the 32-bit products back to 16-bit precision.
    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
  }

  // Stage2
  { //NOLINT
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
    stp2_2 = _mm_packs_epi32(tmp6, tmp4);

    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);

    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  { //NOLINT
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);

    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);

  // Transpose back for the column pass; upper four rows are known zero.
  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  // Columns: full 8-point IDCT with the top-half inputs forced to zero.
  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
           in0, in1, in2, in3, in4, in5, in6, in7);
  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  // Add each reconstructed row into dest; RECON_AND_STORE advances dest.
  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}
   1137 
/* Full 16-point 1-D inverse DCT butterfly network (stages 2 through 6).
 * This macro expands in callers such as the 16x16 idct functions and relies
 * on the caller having the following names in scope: in[0..15] (inputs),
 * stp1_* / stp2_* / stp1_8_0 / stp1_12_0 (stage registers), tmp0..tmp3,
 * 'rounding', and the stg2_* / stg3_* / stg4_* / stg6_* cosine constants.
 * Stage-6 results are left in the stp2_* / stp1_* registers for the caller
 * to combine.
 */
#define IDCT16 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
    \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
    \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
    \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }
   1266 
/* Reduced 16-point 1-D inverse DCT for the sparse case where only the
 * low-frequency inputs in[0..3] are non-zero; the remaining coefficients
 * are represented by the caller's 'zero' register.  Same scoping contract
 * as IDCT16: the caller must provide in[], zero, stp1_* / stp2_* registers
 * (including stp1_8_0 / stp1_12_0), tmp0..tmp3, 'rounding', and the
 * stg2_* / stg3_* / stg4_* / stg6_* cosine constants.
 */
#define IDCT16_10 \
    /* Stage2 */ \
    { \
      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
      \
      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
                             stg2_0, stg2_1, stg2_6, stg2_7, \
                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
    } \
      \
    /* Stage3 */ \
    { \
      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
      \
      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
                               stg3_0, stg3_1,  \
                               stp2_4, stp2_7) \
      \
      stp1_9  =  stp1_8_0; \
      stp1_10 =  stp1_11;  \
      \
      stp1_13 = stp1_12_0; \
      stp1_14 = stp1_15;   \
    } \
    \
    /* Stage4 */ \
    { \
      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
      \
      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
      \
      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
                               stg4_0, stg4_1, \
                               stp1_0, stp1_1) \
      stp2_5 = stp2_4; \
      stp2_6 = stp2_7; \
      \
      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                             stg4_4, stg4_5, stg4_6, stg4_7, \
                             stp2_9, stp2_14, stp2_10, stp2_13) \
    } \
      \
    /* Stage5 */ \
    { \
      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_2 = stp1_1; \
      stp1_3 = stp1_0; \
      \
      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
      \
      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
      \
      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
    } \
      \
    /* Stage6 */ \
    { \
      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
      \
      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
      \
      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                             stg6_0, stg4_0, stg6_0, stg4_0, \
                             stp2_10, stp2_13, stp2_11, stp2_12) \
    }
   1374 
// Full 16x16 inverse DCT for up to 256 non-zero coefficients, plus
// reconstruction: dest[r][c] = clip_to_uint8(dest[r][c] + result[r][c]).
//
// input:  256 int16 coefficients, row-major, 16-byte aligned.
// dest:   8-bit prediction/reconstruction buffer with the given stride.
//
// The 2-D transform is computed as two 1-D passes.  Each pass handles the
// 16x16 block as two 8x16 halves (the IDCT16 macro works on 16 __m128i
// registers of 8 lanes each); results of pass 1 are kept in l[] and r[].
void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
                                int stride) {
  // Internal butterfly rounding; final_rounding = 1<<5 is the bias for the
  // final >>6 shift that removes the gain of the two 1-D passes.
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1<<5);
  const __m128i zero = _mm_setzero_si128();

  // Packed cosine-pair multipliers for the IDCT16 butterfly stages.
  // pair_set_epi16 packs two 16-bit constants so _mm_madd_epi16 computes
  // a*c0 + b*c1 per 32-bit lane.  Stage 2 constants:
  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  // Stage 3 constants.
  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  // Stage 4 constants.
  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  // Stage 6 constant.
  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  // l[]/r[] hold the two 8-lane halves of the pass-1 output; curr1 points
  // at whichever of the two is being written this iteration.
  __m128i in[16], l[16], r[16], *curr1;
  // stp1_*/stp2_*/tmp* are the working registers consumed and produced by
  // the IDCT16 macro's butterfly stages.
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Pass 1 (rows): two iterations, eight rows each (input += 128 advances
  // to rows 8..15).  Results go to l[] then r[].
  curr1 = l;
  for (i = 0; i < 2; i++) {
      // 1-D idct

      // Load input data.
      // Interleaved layout: in[0..7] get columns 0-7 of the eight rows,
      // in[8..15] get columns 8-15, so each 8x8 quadrant can be
      // transposed independently below.
      in[0] = _mm_load_si128((const __m128i *)input);
      in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
      in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
      in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
      in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
      in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
      in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
      in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
      in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
      in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
      in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
      in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
      in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
      in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
      in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
      in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));

      // Transpose both 8x8 quadrants so the 1-D transform runs along rows.
      array_transpose_8x8(in, in);
      array_transpose_8x8(in+8, in+8);

      // Butterfly stages 1-6 (macro defined above); consumes in[],
      // produces stp1_*/stp2_*.
      IDCT16

      // Stage7
      // Final butterfly: even outputs stp2_0..7 combined with the odd
      // half (stp1_8..15 / stp2_10..13) in mirrored add/sub pairs.
      curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
      curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
      curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
      curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
      curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
      curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
      curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
      curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
      curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
      curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
      curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
      curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
      curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
      curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
      curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
      curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);

      curr1 = r;
      input += 128;
  }
  // Pass 2 (columns): re-transpose 8-wide slices of the pass-1 output,
  // run the 1-D transform again, then round, shift and reconstruct.
  for (i = 0; i < 2; i++) {
      // 1-D idct
      array_transpose_8x8(l+i*8, in);
      array_transpose_8x8(r+i*8, in+8);

      IDCT16

      // 2-D
      // Same stage-7 butterfly as pass 1, written back into in[].
      in[0] = _mm_add_epi16(stp2_0, stp1_15);
      in[1] = _mm_add_epi16(stp2_1, stp1_14);
      in[2] = _mm_add_epi16(stp2_2, stp2_13);
      in[3] = _mm_add_epi16(stp2_3, stp2_12);
      in[4] = _mm_add_epi16(stp2_4, stp2_11);
      in[5] = _mm_add_epi16(stp2_5, stp2_10);
      in[6] = _mm_add_epi16(stp2_6, stp1_9);
      in[7] = _mm_add_epi16(stp2_7, stp1_8);
      in[8] = _mm_sub_epi16(stp2_7, stp1_8);
      in[9] = _mm_sub_epi16(stp2_6, stp1_9);
      in[10] = _mm_sub_epi16(stp2_5, stp2_10);
      in[11] = _mm_sub_epi16(stp2_4, stp2_11);
      in[12] = _mm_sub_epi16(stp2_3, stp2_12);
      in[13] = _mm_sub_epi16(stp2_2, stp2_13);
      in[14] = _mm_sub_epi16(stp2_1, stp1_14);
      in[15] = _mm_sub_epi16(stp2_0, stp1_15);

      // Final rounding and shift
      // Saturating add of the rounding bias, then arithmetic >>6.
      in[0] = _mm_adds_epi16(in[0], final_rounding);
      in[1] = _mm_adds_epi16(in[1], final_rounding);
      in[2] = _mm_adds_epi16(in[2], final_rounding);
      in[3] = _mm_adds_epi16(in[3], final_rounding);
      in[4] = _mm_adds_epi16(in[4], final_rounding);
      in[5] = _mm_adds_epi16(in[5], final_rounding);
      in[6] = _mm_adds_epi16(in[6], final_rounding);
      in[7] = _mm_adds_epi16(in[7], final_rounding);
      in[8] = _mm_adds_epi16(in[8], final_rounding);
      in[9] = _mm_adds_epi16(in[9], final_rounding);
      in[10] = _mm_adds_epi16(in[10], final_rounding);
      in[11] = _mm_adds_epi16(in[11], final_rounding);
      in[12] = _mm_adds_epi16(in[12], final_rounding);
      in[13] = _mm_adds_epi16(in[13], final_rounding);
      in[14] = _mm_adds_epi16(in[14], final_rounding);
      in[15] = _mm_adds_epi16(in[15], final_rounding);

      in[0] = _mm_srai_epi16(in[0], 6);
      in[1] = _mm_srai_epi16(in[1], 6);
      in[2] = _mm_srai_epi16(in[2], 6);
      in[3] = _mm_srai_epi16(in[3], 6);
      in[4] = _mm_srai_epi16(in[4], 6);
      in[5] = _mm_srai_epi16(in[5], 6);
      in[6] = _mm_srai_epi16(in[6], 6);
      in[7] = _mm_srai_epi16(in[7], 6);
      in[8] = _mm_srai_epi16(in[8], 6);
      in[9] = _mm_srai_epi16(in[9], 6);
      in[10] = _mm_srai_epi16(in[10], 6);
      in[11] = _mm_srai_epi16(in[11], 6);
      in[12] = _mm_srai_epi16(in[12], 6);
      in[13] = _mm_srai_epi16(in[13], 6);
      in[14] = _mm_srai_epi16(in[14], 6);
      in[15] = _mm_srai_epi16(in[15], 6);

      // Add residual to prediction, saturate to 8 bits and store; the
      // macro advances dest by stride after each row.
      RECON_AND_STORE(dest, in[0]);
      RECON_AND_STORE(dest, in[1]);
      RECON_AND_STORE(dest, in[2]);
      RECON_AND_STORE(dest, in[3]);
      RECON_AND_STORE(dest, in[4]);
      RECON_AND_STORE(dest, in[5]);
      RECON_AND_STORE(dest, in[6]);
      RECON_AND_STORE(dest, in[7]);
      RECON_AND_STORE(dest, in[8]);
      RECON_AND_STORE(dest, in[9]);
      RECON_AND_STORE(dest, in[10]);
      RECON_AND_STORE(dest, in[11]);
      RECON_AND_STORE(dest, in[12]);
      RECON_AND_STORE(dest, in[13]);
      RECON_AND_STORE(dest, in[14]);
      RECON_AND_STORE(dest, in[15]);

      // Step right 8 pixels and back up 16 rows for the second half.
      dest += 8 - (stride * 16);
  }
}
   1543 
   1544 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   1545   __m128i dc_value;
   1546   const __m128i zero = _mm_setzero_si128();
   1547   int a, i;
   1548 
   1549   a = dct_const_round_shift(input[0] * cospi_16_64);
   1550   a = dct_const_round_shift(a * cospi_16_64);
   1551   a = ROUND_POWER_OF_TWO(a, 6);
   1552 
   1553   dc_value = _mm_set1_epi16(a);
   1554 
   1555   for (i = 0; i < 2; ++i) {
   1556     RECON_AND_STORE(dest, dc_value);
   1557     RECON_AND_STORE(dest, dc_value);
   1558     RECON_AND_STORE(dest, dc_value);
   1559     RECON_AND_STORE(dest, dc_value);
   1560     RECON_AND_STORE(dest, dc_value);
   1561     RECON_AND_STORE(dest, dc_value);
   1562     RECON_AND_STORE(dest, dc_value);
   1563     RECON_AND_STORE(dest, dc_value);
   1564     RECON_AND_STORE(dest, dc_value);
   1565     RECON_AND_STORE(dest, dc_value);
   1566     RECON_AND_STORE(dest, dc_value);
   1567     RECON_AND_STORE(dest, dc_value);
   1568     RECON_AND_STORE(dest, dc_value);
   1569     RECON_AND_STORE(dest, dc_value);
   1570     RECON_AND_STORE(dest, dc_value);
   1571     RECON_AND_STORE(dest, dc_value);
   1572     dest += 8 - (stride * 16);
   1573   }
   1574 }
   1575 
   1576 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
   1577   __m128i tbuf[8];
   1578   array_transpose_8x8(res0, res0);
   1579   array_transpose_8x8(res1, tbuf);
   1580   array_transpose_8x8(res0 + 8, res1);
   1581   array_transpose_8x8(res1 + 8, res1 + 8);
   1582 
   1583   res0[8] = tbuf[0];
   1584   res0[9] = tbuf[1];
   1585   res0[10] = tbuf[2];
   1586   res0[11] = tbuf[3];
   1587   res0[12] = tbuf[4];
   1588   res0[13] = tbuf[5];
   1589   res0[14] = tbuf[6];
   1590   res0[15] = tbuf[7];
   1591 }
   1592 
   1593 static void iadst16_8col(__m128i *in) {
   1594   // perform 16x16 1-D ADST for 8 columns
   1595   __m128i s[16], x[16], u[32], v[32];
   1596   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
   1597   const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
   1598   const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
   1599   const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
   1600   const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
   1601   const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
   1602   const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
   1603   const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
   1604   const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
   1605   const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
   1606   const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
   1607   const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
   1608   const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
   1609   const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
   1610   const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
   1611   const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
   1612   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
   1613   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
   1614   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
   1615   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
   1616   const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
   1617   const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
   1618   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
   1619   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   1620   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
   1621   const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
   1622   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   1623   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   1624   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   1625   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   1626   const __m128i kZero = _mm_set1_epi16(0);
   1627 
   1628   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
   1629   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
   1630   u[2] = _mm_unpacklo_epi16(in[13], in[2]);
   1631   u[3] = _mm_unpackhi_epi16(in[13], in[2]);
   1632   u[4] = _mm_unpacklo_epi16(in[11], in[4]);
   1633   u[5] = _mm_unpackhi_epi16(in[11], in[4]);
   1634   u[6] = _mm_unpacklo_epi16(in[9], in[6]);
   1635   u[7] = _mm_unpackhi_epi16(in[9], in[6]);
   1636   u[8] = _mm_unpacklo_epi16(in[7], in[8]);
   1637   u[9] = _mm_unpackhi_epi16(in[7], in[8]);
   1638   u[10] = _mm_unpacklo_epi16(in[5], in[10]);
   1639   u[11] = _mm_unpackhi_epi16(in[5], in[10]);
   1640   u[12] = _mm_unpacklo_epi16(in[3], in[12]);
   1641   u[13] = _mm_unpackhi_epi16(in[3], in[12]);
   1642   u[14] = _mm_unpacklo_epi16(in[1], in[14]);
   1643   u[15] = _mm_unpackhi_epi16(in[1], in[14]);
   1644 
   1645   v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
   1646   v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
   1647   v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
   1648   v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
   1649   v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
   1650   v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
   1651   v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
   1652   v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
   1653   v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
   1654   v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
   1655   v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
   1656   v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
   1657   v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
   1658   v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
   1659   v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
   1660   v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
   1661   v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
   1662   v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
   1663   v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
   1664   v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
   1665   v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
   1666   v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
   1667   v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
   1668   v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
   1669   v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
   1670   v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
   1671   v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
   1672   v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
   1673   v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
   1674   v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
   1675   v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
   1676   v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
   1677 
   1678   u[0] = _mm_add_epi32(v[0], v[16]);
   1679   u[1] = _mm_add_epi32(v[1], v[17]);
   1680   u[2] = _mm_add_epi32(v[2], v[18]);
   1681   u[3] = _mm_add_epi32(v[3], v[19]);
   1682   u[4] = _mm_add_epi32(v[4], v[20]);
   1683   u[5] = _mm_add_epi32(v[5], v[21]);
   1684   u[6] = _mm_add_epi32(v[6], v[22]);
   1685   u[7] = _mm_add_epi32(v[7], v[23]);
   1686   u[8] = _mm_add_epi32(v[8], v[24]);
   1687   u[9] = _mm_add_epi32(v[9], v[25]);
   1688   u[10] = _mm_add_epi32(v[10], v[26]);
   1689   u[11] = _mm_add_epi32(v[11], v[27]);
   1690   u[12] = _mm_add_epi32(v[12], v[28]);
   1691   u[13] = _mm_add_epi32(v[13], v[29]);
   1692   u[14] = _mm_add_epi32(v[14], v[30]);
   1693   u[15] = _mm_add_epi32(v[15], v[31]);
   1694   u[16] = _mm_sub_epi32(v[0], v[16]);
   1695   u[17] = _mm_sub_epi32(v[1], v[17]);
   1696   u[18] = _mm_sub_epi32(v[2], v[18]);
   1697   u[19] = _mm_sub_epi32(v[3], v[19]);
   1698   u[20] = _mm_sub_epi32(v[4], v[20]);
   1699   u[21] = _mm_sub_epi32(v[5], v[21]);
   1700   u[22] = _mm_sub_epi32(v[6], v[22]);
   1701   u[23] = _mm_sub_epi32(v[7], v[23]);
   1702   u[24] = _mm_sub_epi32(v[8], v[24]);
   1703   u[25] = _mm_sub_epi32(v[9], v[25]);
   1704   u[26] = _mm_sub_epi32(v[10], v[26]);
   1705   u[27] = _mm_sub_epi32(v[11], v[27]);
   1706   u[28] = _mm_sub_epi32(v[12], v[28]);
   1707   u[29] = _mm_sub_epi32(v[13], v[29]);
   1708   u[30] = _mm_sub_epi32(v[14], v[30]);
   1709   u[31] = _mm_sub_epi32(v[15], v[31]);
   1710 
   1711   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
   1712   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
   1713   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
   1714   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
   1715   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
   1716   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
   1717   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
   1718   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
   1719   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
   1720   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
   1721   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
   1722   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
   1723   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
   1724   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
   1725   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
   1726   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
   1727   v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
   1728   v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
   1729   v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
   1730   v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
   1731   v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
   1732   v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
   1733   v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
   1734   v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
   1735   v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
   1736   v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
   1737   v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
   1738   v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
   1739   v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
   1740   v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
   1741   v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
   1742   v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
   1743 
   1744   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
   1745   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
   1746   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
   1747   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
   1748   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
   1749   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
   1750   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
   1751   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
   1752   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
   1753   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
   1754   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
   1755   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
   1756   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
   1757   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
   1758   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
   1759   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
   1760   u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
   1761   u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
   1762   u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
   1763   u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
   1764   u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
   1765   u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
   1766   u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
   1767   u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
   1768   u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
   1769   u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
   1770   u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
   1771   u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
   1772   u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
   1773   u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
   1774   u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
   1775   u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
   1776 
   1777   s[0] = _mm_packs_epi32(u[0], u[1]);
   1778   s[1] = _mm_packs_epi32(u[2], u[3]);
   1779   s[2] = _mm_packs_epi32(u[4], u[5]);
   1780   s[3] = _mm_packs_epi32(u[6], u[7]);
   1781   s[4] = _mm_packs_epi32(u[8], u[9]);
   1782   s[5] = _mm_packs_epi32(u[10], u[11]);
   1783   s[6] = _mm_packs_epi32(u[12], u[13]);
   1784   s[7] = _mm_packs_epi32(u[14], u[15]);
   1785   s[8] = _mm_packs_epi32(u[16], u[17]);
   1786   s[9] = _mm_packs_epi32(u[18], u[19]);
   1787   s[10] = _mm_packs_epi32(u[20], u[21]);
   1788   s[11] = _mm_packs_epi32(u[22], u[23]);
   1789   s[12] = _mm_packs_epi32(u[24], u[25]);
   1790   s[13] = _mm_packs_epi32(u[26], u[27]);
   1791   s[14] = _mm_packs_epi32(u[28], u[29]);
   1792   s[15] = _mm_packs_epi32(u[30], u[31]);
   1793 
   1794   // stage 2
   1795   u[0] = _mm_unpacklo_epi16(s[8], s[9]);
   1796   u[1] = _mm_unpackhi_epi16(s[8], s[9]);
   1797   u[2] = _mm_unpacklo_epi16(s[10], s[11]);
   1798   u[3] = _mm_unpackhi_epi16(s[10], s[11]);
   1799   u[4] = _mm_unpacklo_epi16(s[12], s[13]);
   1800   u[5] = _mm_unpackhi_epi16(s[12], s[13]);
   1801   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
   1802   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
   1803 
   1804   v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
   1805   v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
   1806   v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
   1807   v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
   1808   v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
   1809   v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
   1810   v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
   1811   v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
   1812   v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
   1813   v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
   1814   v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
   1815   v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
   1816   v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
   1817   v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
   1818   v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
   1819   v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
   1820 
   1821   u[0] = _mm_add_epi32(v[0], v[8]);
   1822   u[1] = _mm_add_epi32(v[1], v[9]);
   1823   u[2] = _mm_add_epi32(v[2], v[10]);
   1824   u[3] = _mm_add_epi32(v[3], v[11]);
   1825   u[4] = _mm_add_epi32(v[4], v[12]);
   1826   u[5] = _mm_add_epi32(v[5], v[13]);
   1827   u[6] = _mm_add_epi32(v[6], v[14]);
   1828   u[7] = _mm_add_epi32(v[7], v[15]);
   1829   u[8] = _mm_sub_epi32(v[0], v[8]);
   1830   u[9] = _mm_sub_epi32(v[1], v[9]);
   1831   u[10] = _mm_sub_epi32(v[2], v[10]);
   1832   u[11] = _mm_sub_epi32(v[3], v[11]);
   1833   u[12] = _mm_sub_epi32(v[4], v[12]);
   1834   u[13] = _mm_sub_epi32(v[5], v[13]);
   1835   u[14] = _mm_sub_epi32(v[6], v[14]);
   1836   u[15] = _mm_sub_epi32(v[7], v[15]);
   1837 
   1838   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
   1839   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
   1840   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
   1841   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
   1842   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
   1843   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
   1844   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
   1845   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
   1846   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
   1847   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
   1848   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
   1849   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
   1850   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
   1851   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
   1852   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
   1853   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
   1854 
   1855   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
   1856   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
   1857   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
   1858   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
   1859   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
   1860   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
   1861   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
   1862   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
   1863   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
   1864   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
   1865   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
   1866   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
   1867   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
   1868   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
   1869   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
   1870   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
   1871 
   1872   x[0] = _mm_add_epi16(s[0], s[4]);
   1873   x[1] = _mm_add_epi16(s[1], s[5]);
   1874   x[2] = _mm_add_epi16(s[2], s[6]);
   1875   x[3] = _mm_add_epi16(s[3], s[7]);
   1876   x[4] = _mm_sub_epi16(s[0], s[4]);
   1877   x[5] = _mm_sub_epi16(s[1], s[5]);
   1878   x[6] = _mm_sub_epi16(s[2], s[6]);
   1879   x[7] = _mm_sub_epi16(s[3], s[7]);
   1880   x[8] = _mm_packs_epi32(u[0], u[1]);
   1881   x[9] = _mm_packs_epi32(u[2], u[3]);
   1882   x[10] = _mm_packs_epi32(u[4], u[5]);
   1883   x[11] = _mm_packs_epi32(u[6], u[7]);
   1884   x[12] = _mm_packs_epi32(u[8], u[9]);
   1885   x[13] = _mm_packs_epi32(u[10], u[11]);
   1886   x[14] = _mm_packs_epi32(u[12], u[13]);
   1887   x[15] = _mm_packs_epi32(u[14], u[15]);
   1888 
   1889   // stage 3
   1890   u[0] = _mm_unpacklo_epi16(x[4], x[5]);
   1891   u[1] = _mm_unpackhi_epi16(x[4], x[5]);
   1892   u[2] = _mm_unpacklo_epi16(x[6], x[7]);
   1893   u[3] = _mm_unpackhi_epi16(x[6], x[7]);
   1894   u[4] = _mm_unpacklo_epi16(x[12], x[13]);
   1895   u[5] = _mm_unpackhi_epi16(x[12], x[13]);
   1896   u[6] = _mm_unpacklo_epi16(x[14], x[15]);
   1897   u[7] = _mm_unpackhi_epi16(x[14], x[15]);
   1898 
   1899   v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
   1900   v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
   1901   v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
   1902   v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
   1903   v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
   1904   v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
   1905   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
   1906   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
   1907   v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
   1908   v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
   1909   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
   1910   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
   1911   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
   1912   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
   1913   v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
   1914   v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
   1915 
   1916   u[0] = _mm_add_epi32(v[0], v[4]);
   1917   u[1] = _mm_add_epi32(v[1], v[5]);
   1918   u[2] = _mm_add_epi32(v[2], v[6]);
   1919   u[3] = _mm_add_epi32(v[3], v[7]);
   1920   u[4] = _mm_sub_epi32(v[0], v[4]);
   1921   u[5] = _mm_sub_epi32(v[1], v[5]);
   1922   u[6] = _mm_sub_epi32(v[2], v[6]);
   1923   u[7] = _mm_sub_epi32(v[3], v[7]);
   1924   u[8] = _mm_add_epi32(v[8], v[12]);
   1925   u[9] = _mm_add_epi32(v[9], v[13]);
   1926   u[10] = _mm_add_epi32(v[10], v[14]);
   1927   u[11] = _mm_add_epi32(v[11], v[15]);
   1928   u[12] = _mm_sub_epi32(v[8], v[12]);
   1929   u[13] = _mm_sub_epi32(v[9], v[13]);
   1930   u[14] = _mm_sub_epi32(v[10], v[14]);
   1931   u[15] = _mm_sub_epi32(v[11], v[15]);
   1932 
   1933   u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
   1934   u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
   1935   u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
   1936   u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
   1937   u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
   1938   u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
   1939   u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
   1940   u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
   1941   u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
   1942   u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
   1943   u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
   1944   u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
   1945   u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
   1946   u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
   1947   u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
   1948   u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
   1949 
   1950   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
   1951   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
   1952   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   1953   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
   1954   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
   1955   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
   1956   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
   1957   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
   1958   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
   1959   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
   1960   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
   1961   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
   1962   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
   1963   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
   1964   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
   1965   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
   1966 
   1967   s[0] = _mm_add_epi16(x[0], x[2]);
   1968   s[1] = _mm_add_epi16(x[1], x[3]);
   1969   s[2] = _mm_sub_epi16(x[0], x[2]);
   1970   s[3] = _mm_sub_epi16(x[1], x[3]);
   1971   s[4] = _mm_packs_epi32(v[0], v[1]);
   1972   s[5] = _mm_packs_epi32(v[2], v[3]);
   1973   s[6] = _mm_packs_epi32(v[4], v[5]);
   1974   s[7] = _mm_packs_epi32(v[6], v[7]);
   1975   s[8] = _mm_add_epi16(x[8], x[10]);
   1976   s[9] = _mm_add_epi16(x[9], x[11]);
   1977   s[10] = _mm_sub_epi16(x[8], x[10]);
   1978   s[11] = _mm_sub_epi16(x[9], x[11]);
   1979   s[12] = _mm_packs_epi32(v[8], v[9]);
   1980   s[13] = _mm_packs_epi32(v[10], v[11]);
   1981   s[14] = _mm_packs_epi32(v[12], v[13]);
   1982   s[15] = _mm_packs_epi32(v[14], v[15]);
   1983 
   1984   // stage 4
   1985   u[0] = _mm_unpacklo_epi16(s[2], s[3]);
   1986   u[1] = _mm_unpackhi_epi16(s[2], s[3]);
   1987   u[2] = _mm_unpacklo_epi16(s[6], s[7]);
   1988   u[3] = _mm_unpackhi_epi16(s[6], s[7]);
   1989   u[4] = _mm_unpacklo_epi16(s[10], s[11]);
   1990   u[5] = _mm_unpackhi_epi16(s[10], s[11]);
   1991   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
   1992   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
   1993 
   1994   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
   1995   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
   1996   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
   1997   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
   1998   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
   1999   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
   2000   v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
   2001   v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
   2002   v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
   2003   v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
   2004   v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
   2005   v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
   2006   v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
   2007   v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
   2008   v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
   2009   v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
   2010 
   2011   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
   2012   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
   2013   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
   2014   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
   2015   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
   2016   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
   2017   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
   2018   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
   2019   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
   2020   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
   2021   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
   2022   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
   2023   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
   2024   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
   2025   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
   2026   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
   2027 
   2028   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
   2029   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
   2030   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   2031   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
   2032   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
   2033   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
   2034   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
   2035   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
   2036   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
   2037   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
   2038   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
   2039   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
   2040   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
   2041   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
   2042   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
   2043   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
   2044 
   2045   in[0] = s[0];
   2046   in[1] = _mm_sub_epi16(kZero, s[8]);
   2047   in[2] = s[12];
   2048   in[3] = _mm_sub_epi16(kZero, s[4]);
   2049   in[4] = _mm_packs_epi32(v[4], v[5]);
   2050   in[5] = _mm_packs_epi32(v[12], v[13]);
   2051   in[6] = _mm_packs_epi32(v[8], v[9]);
   2052   in[7] = _mm_packs_epi32(v[0], v[1]);
   2053   in[8] = _mm_packs_epi32(v[2], v[3]);
   2054   in[9] = _mm_packs_epi32(v[10], v[11]);
   2055   in[10] = _mm_packs_epi32(v[14], v[15]);
   2056   in[11] = _mm_packs_epi32(v[6], v[7]);
   2057   in[12] = s[5];
   2058   in[13] = _mm_sub_epi16(kZero, s[13]);
   2059   in[14] = s[9];
   2060   in[15] = _mm_sub_epi16(kZero, s[1]);
   2061 }
   2062 
// 1-D 16-point inverse DCT applied down the columns of an 8x16 half-block.
// in: 16 __m128i registers, each holding one row of 8 int16 coefficients;
// the transform is computed in place.  Each butterfly stage follows the
// vp9_idct16_c reference flow: 16-bit pairs are interleaved (unpack),
// multiplied against packed cosine-constant pairs (madd -> 32-bit),
// rounded, shifted by DCT_CONST_BITS, and re-packed to saturated 16-bit.
static void idct16_8col(__m128i *in) {
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // s/t alternate as the working state between stages; u holds unpacked
  // 16-bit pairs and v holds the 32-bit madd products.
  __m128i v[16], u[16], s[16], t[16];

  // stage 1: reorder inputs (even rows first, then odd rows).
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2: rotations of the odd half (s[8..15]).
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  // Round and descale the 32-bit products back to 16-bit precision.
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[8]  = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9]  = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);

  // stage 3: rotate s[4..7]; butterfly add/sub on the odd half.
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // stage 4: rotate t[0..3] and t[9]/t[14], t[10]/t[13].
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[9]  = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);
  s[11] = t[11];
  s[12] = t[12];

  // stage 5: butterflies on the even half; s[5]/s[6] half-turn rotation.
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  t[5] = _mm_packs_epi32(u[0], u[1]);
  t[6] = _mm_packs_epi32(u[2], u[3]);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6: final even-half butterflies; rotate t[10..13] pairs.
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];

  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7: output butterfly — sums in the top half, differences mirrored.
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}
   2406 
   2407 static void idct16_sse2(__m128i *in0, __m128i *in1) {
   2408   array_transpose_16x16(in0, in1);
   2409   idct16_8col(in0);
   2410   idct16_8col(in1);
   2411 }
   2412 
   2413 static void iadst16_sse2(__m128i *in0, __m128i *in1) {
   2414   array_transpose_16x16(in0, in1);
   2415   iadst16_8col(in0);
   2416   iadst16_8col(in1);
   2417 }
   2418 
   2419 static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
   2420   in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
   2421   in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
   2422   in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
   2423   in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
   2424   in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
   2425   in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
   2426   in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
   2427   in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
   2428 
   2429   in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
   2430   in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
   2431   in[10]  = _mm_load_si128((const __m128i *)(input + 10 * 16));
   2432   in[11]  = _mm_load_si128((const __m128i *)(input + 11 * 16));
   2433   in[12]  = _mm_load_si128((const __m128i *)(input + 12 * 16));
   2434   in[13]  = _mm_load_si128((const __m128i *)(input + 13 * 16));
   2435   in[14]  = _mm_load_si128((const __m128i *)(input + 14 * 16));
   2436   in[15]  = _mm_load_si128((const __m128i *)(input + 15 * 16));
   2437 }
   2438 
   2439 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
   2440   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   2441   const __m128i zero = _mm_setzero_si128();
   2442   // Final rounding and shift
   2443   in[0] = _mm_adds_epi16(in[0], final_rounding);
   2444   in[1] = _mm_adds_epi16(in[1], final_rounding);
   2445   in[2] = _mm_adds_epi16(in[2], final_rounding);
   2446   in[3] = _mm_adds_epi16(in[3], final_rounding);
   2447   in[4] = _mm_adds_epi16(in[4], final_rounding);
   2448   in[5] = _mm_adds_epi16(in[5], final_rounding);
   2449   in[6] = _mm_adds_epi16(in[6], final_rounding);
   2450   in[7] = _mm_adds_epi16(in[7], final_rounding);
   2451   in[8] = _mm_adds_epi16(in[8], final_rounding);
   2452   in[9] = _mm_adds_epi16(in[9], final_rounding);
   2453   in[10] = _mm_adds_epi16(in[10], final_rounding);
   2454   in[11] = _mm_adds_epi16(in[11], final_rounding);
   2455   in[12] = _mm_adds_epi16(in[12], final_rounding);
   2456   in[13] = _mm_adds_epi16(in[13], final_rounding);
   2457   in[14] = _mm_adds_epi16(in[14], final_rounding);
   2458   in[15] = _mm_adds_epi16(in[15], final_rounding);
   2459 
   2460   in[0] = _mm_srai_epi16(in[0], 6);
   2461   in[1] = _mm_srai_epi16(in[1], 6);
   2462   in[2] = _mm_srai_epi16(in[2], 6);
   2463   in[3] = _mm_srai_epi16(in[3], 6);
   2464   in[4] = _mm_srai_epi16(in[4], 6);
   2465   in[5] = _mm_srai_epi16(in[5], 6);
   2466   in[6] = _mm_srai_epi16(in[6], 6);
   2467   in[7] = _mm_srai_epi16(in[7], 6);
   2468   in[8] = _mm_srai_epi16(in[8], 6);
   2469   in[9] = _mm_srai_epi16(in[9], 6);
   2470   in[10] = _mm_srai_epi16(in[10], 6);
   2471   in[11] = _mm_srai_epi16(in[11], 6);
   2472   in[12] = _mm_srai_epi16(in[12], 6);
   2473   in[13] = _mm_srai_epi16(in[13], 6);
   2474   in[14] = _mm_srai_epi16(in[14], 6);
   2475   in[15] = _mm_srai_epi16(in[15], 6);
   2476 
   2477   RECON_AND_STORE(dest, in[0]);
   2478   RECON_AND_STORE(dest, in[1]);
   2479   RECON_AND_STORE(dest, in[2]);
   2480   RECON_AND_STORE(dest, in[3]);
   2481   RECON_AND_STORE(dest, in[4]);
   2482   RECON_AND_STORE(dest, in[5]);
   2483   RECON_AND_STORE(dest, in[6]);
   2484   RECON_AND_STORE(dest, in[7]);
   2485   RECON_AND_STORE(dest, in[8]);
   2486   RECON_AND_STORE(dest, in[9]);
   2487   RECON_AND_STORE(dest, in[10]);
   2488   RECON_AND_STORE(dest, in[11]);
   2489   RECON_AND_STORE(dest, in[12]);
   2490   RECON_AND_STORE(dest, in[13]);
   2491   RECON_AND_STORE(dest, in[14]);
   2492   RECON_AND_STORE(dest, in[15]);
   2493 }
   2494 
   2495 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
   2496                                int tx_type) {
   2497   __m128i in0[16], in1[16];
   2498 
   2499   load_buffer_8x16(input, in0);
   2500   input += 8;
   2501   load_buffer_8x16(input, in1);
   2502 
   2503   switch (tx_type) {
   2504     case 0:  // DCT_DCT
   2505       idct16_sse2(in0, in1);
   2506       idct16_sse2(in0, in1);
   2507       break;
   2508     case 1:  // ADST_DCT
   2509       idct16_sse2(in0, in1);
   2510       iadst16_sse2(in0, in1);
   2511       break;
   2512     case 2:  // DCT_ADST
   2513       iadst16_sse2(in0, in1);
   2514       idct16_sse2(in0, in1);
   2515       break;
   2516     case 3:  // ADST_ADST
   2517       iadst16_sse2(in0, in1);
   2518       iadst16_sse2(in0, in1);
   2519       break;
   2520     default:
   2521       assert(0);
   2522       break;
   2523   }
   2524 
   2525   write_buffer_8x16(dest, in0, stride);
   2526   dest += 8;
   2527   write_buffer_8x16(dest, in1, stride);
   2528 }
   2529 
   2530 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
   2531                                int stride) {
   2532   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   2533   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   2534   const __m128i zero = _mm_setzero_si128();
   2535 
   2536   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   2537   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
   2538   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
   2539   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
   2540 
   2541   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
   2542   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
   2543 
   2544   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
   2545   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   2546   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   2547   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
   2548   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
   2549   const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   2550 
   2551   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   2552   __m128i in[16], l[16];
   2553   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
   2554           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
   2555           stp1_8_0, stp1_12_0;
   2556   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
   2557           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
   2558   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   2559   int i;
   2560   // First 1-D inverse DCT
   2561   // Load input data.
   2562   in[0] = _mm_load_si128((const __m128i *)input);
   2563   in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
   2564   in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
   2565   in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
   2566 
   2567   TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
   2568 
   2569   // Stage2
   2570   {
   2571     const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
   2572     const __m128i lo_13_3 =  _mm_unpackhi_epi16(zero, in[1]);
   2573 
   2574     tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
   2575     tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
   2576     tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
   2577     tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
   2578 
   2579     tmp0 = _mm_add_epi32(tmp0, rounding);
   2580     tmp2 = _mm_add_epi32(tmp2, rounding);
   2581     tmp5 = _mm_add_epi32(tmp5, rounding);
   2582     tmp7 = _mm_add_epi32(tmp7, rounding);
   2583 
   2584     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
   2585     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
   2586     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
   2587     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
   2588 
   2589     stp2_8  = _mm_packs_epi32(tmp0, tmp2);
   2590     stp2_11 = _mm_packs_epi32(tmp5, tmp7);
   2591   }
   2592 
   2593   // Stage3
   2594   {
   2595     const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
   2596 
   2597     tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
   2598     tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
   2599 
   2600     tmp0 = _mm_add_epi32(tmp0, rounding);
   2601     tmp2 = _mm_add_epi32(tmp2, rounding);
   2602     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
   2603     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
   2604 
   2605     stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
   2606     stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
   2607 
   2608     stp1_4 = _mm_packs_epi32(tmp0, tmp2);
   2609   }
   2610 
   2611   // Stage4
   2612   {
   2613     const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
   2614     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
   2615     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
   2616 
   2617     tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
   2618     tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
   2619     tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
   2620     tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
   2621     tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
   2622     tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
   2623 
   2624     tmp0 = _mm_add_epi32(tmp0, rounding);
   2625     tmp2 = _mm_add_epi32(tmp2, rounding);
   2626     tmp1 = _mm_add_epi32(tmp1, rounding);
   2627     tmp3 = _mm_add_epi32(tmp3, rounding);
   2628     tmp5 = _mm_add_epi32(tmp5, rounding);
   2629     tmp7 = _mm_add_epi32(tmp7, rounding);
   2630 
   2631     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
   2632     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
   2633     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
   2634     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
   2635     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
   2636     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
   2637 
   2638     stp1_0 = _mm_packs_epi32(tmp0, tmp0);
   2639     stp1_1 = _mm_packs_epi32(tmp2, tmp2);
   2640     stp2_9 = _mm_packs_epi32(tmp1, tmp3);
   2641     stp2_10 = _mm_packs_epi32(tmp5, tmp7);
   2642 
   2643     stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
   2644   }
   2645 
   2646   // Stage5 and Stage6
   2647   {
   2648     tmp0 = _mm_add_epi16(stp2_8, stp2_11);
   2649     tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
   2650     tmp2 = _mm_add_epi16(stp2_9, stp2_10);
   2651     tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
   2652 
   2653     stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
   2654     stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
   2655     stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
   2656     stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
   2657 
   2658     stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
   2659     stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
   2660     stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
   2661     stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
   2662   }
   2663 
   2664   // Stage6
   2665   {
   2666     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
   2667     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
   2668     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
   2669 
   2670     tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
   2671     tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
   2672     tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
   2673     tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
   2674     tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
   2675     tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
   2676 
   2677     tmp1 = _mm_add_epi32(tmp1, rounding);
   2678     tmp3 = _mm_add_epi32(tmp3, rounding);
   2679     tmp0 = _mm_add_epi32(tmp0, rounding);
   2680     tmp2 = _mm_add_epi32(tmp2, rounding);
   2681     tmp4 = _mm_add_epi32(tmp4, rounding);
   2682     tmp6 = _mm_add_epi32(tmp6, rounding);
   2683 
   2684     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
   2685     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
   2686     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
   2687     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
   2688     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
   2689     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
   2690 
   2691     stp1_6 = _mm_packs_epi32(tmp3, tmp1);
   2692 
   2693     stp2_10 = _mm_packs_epi32(tmp0, zero);
   2694     stp2_13 = _mm_packs_epi32(tmp2, zero);
   2695     stp2_11 = _mm_packs_epi32(tmp4, zero);
   2696     stp2_12 = _mm_packs_epi32(tmp6, zero);
   2697 
   2698     tmp0 = _mm_add_epi16(stp1_0, stp1_4);
   2699     tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
   2700     tmp2 = _mm_add_epi16(stp1_1, stp1_6);
   2701     tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
   2702 
   2703     stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
   2704     stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
   2705     stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
   2706     stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
   2707     stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
   2708     stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
   2709     stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
   2710     stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
   2711   }
   2712 
   2713   // Stage7. Left 8x16 only.
   2714   l[0] = _mm_add_epi16(stp2_0, stp1_15);
   2715   l[1] = _mm_add_epi16(stp2_1, stp1_14);
   2716   l[2] = _mm_add_epi16(stp2_2, stp2_13);
   2717   l[3] = _mm_add_epi16(stp2_3, stp2_12);
   2718   l[4] = _mm_add_epi16(stp2_4, stp2_11);
   2719   l[5] = _mm_add_epi16(stp2_5, stp2_10);
   2720   l[6] = _mm_add_epi16(stp2_6, stp1_9);
   2721   l[7] = _mm_add_epi16(stp2_7, stp1_8);
   2722   l[8] = _mm_sub_epi16(stp2_7, stp1_8);
   2723   l[9] = _mm_sub_epi16(stp2_6, stp1_9);
   2724   l[10] = _mm_sub_epi16(stp2_5, stp2_10);
   2725   l[11] = _mm_sub_epi16(stp2_4, stp2_11);
   2726   l[12] = _mm_sub_epi16(stp2_3, stp2_12);
   2727   l[13] = _mm_sub_epi16(stp2_2, stp2_13);
   2728   l[14] = _mm_sub_epi16(stp2_1, stp1_14);
   2729   l[15] = _mm_sub_epi16(stp2_0, stp1_15);
   2730 
   2731   // Second 1-D inverse transform, performed per 8x16 block
   2732   for (i = 0; i < 2; i++) {
   2733     array_transpose_4X8(l + 8*i, in);
   2734 
   2735     IDCT16_10
   2736 
   2737     // Stage7
   2738     in[0] = _mm_add_epi16(stp2_0, stp1_15);
   2739     in[1] = _mm_add_epi16(stp2_1, stp1_14);
   2740     in[2] = _mm_add_epi16(stp2_2, stp2_13);
   2741     in[3] = _mm_add_epi16(stp2_3, stp2_12);
   2742     in[4] = _mm_add_epi16(stp2_4, stp2_11);
   2743     in[5] = _mm_add_epi16(stp2_5, stp2_10);
   2744     in[6] = _mm_add_epi16(stp2_6, stp1_9);
   2745     in[7] = _mm_add_epi16(stp2_7, stp1_8);
   2746     in[8] = _mm_sub_epi16(stp2_7, stp1_8);
   2747     in[9] = _mm_sub_epi16(stp2_6, stp1_9);
   2748     in[10] = _mm_sub_epi16(stp2_5, stp2_10);
   2749     in[11] = _mm_sub_epi16(stp2_4, stp2_11);
   2750     in[12] = _mm_sub_epi16(stp2_3, stp2_12);
   2751     in[13] = _mm_sub_epi16(stp2_2, stp2_13);
   2752     in[14] = _mm_sub_epi16(stp2_1, stp1_14);
   2753     in[15] = _mm_sub_epi16(stp2_0, stp1_15);
   2754 
   2755     // Final rounding and shift
   2756     in[0] = _mm_adds_epi16(in[0], final_rounding);
   2757     in[1] = _mm_adds_epi16(in[1], final_rounding);
   2758     in[2] = _mm_adds_epi16(in[2], final_rounding);
   2759     in[3] = _mm_adds_epi16(in[3], final_rounding);
   2760     in[4] = _mm_adds_epi16(in[4], final_rounding);
   2761     in[5] = _mm_adds_epi16(in[5], final_rounding);
   2762     in[6] = _mm_adds_epi16(in[6], final_rounding);
   2763     in[7] = _mm_adds_epi16(in[7], final_rounding);
   2764     in[8] = _mm_adds_epi16(in[8], final_rounding);
   2765     in[9] = _mm_adds_epi16(in[9], final_rounding);
   2766     in[10] = _mm_adds_epi16(in[10], final_rounding);
   2767     in[11] = _mm_adds_epi16(in[11], final_rounding);
   2768     in[12] = _mm_adds_epi16(in[12], final_rounding);
   2769     in[13] = _mm_adds_epi16(in[13], final_rounding);
   2770     in[14] = _mm_adds_epi16(in[14], final_rounding);
   2771     in[15] = _mm_adds_epi16(in[15], final_rounding);
   2772 
   2773     in[0] = _mm_srai_epi16(in[0], 6);
   2774     in[1] = _mm_srai_epi16(in[1], 6);
   2775     in[2] = _mm_srai_epi16(in[2], 6);
   2776     in[3] = _mm_srai_epi16(in[3], 6);
   2777     in[4] = _mm_srai_epi16(in[4], 6);
   2778     in[5] = _mm_srai_epi16(in[5], 6);
   2779     in[6] = _mm_srai_epi16(in[6], 6);
   2780     in[7] = _mm_srai_epi16(in[7], 6);
   2781     in[8] = _mm_srai_epi16(in[8], 6);
   2782     in[9] = _mm_srai_epi16(in[9], 6);
   2783     in[10] = _mm_srai_epi16(in[10], 6);
   2784     in[11] = _mm_srai_epi16(in[11], 6);
   2785     in[12] = _mm_srai_epi16(in[12], 6);
   2786     in[13] = _mm_srai_epi16(in[13], 6);
   2787     in[14] = _mm_srai_epi16(in[14], 6);
   2788     in[15] = _mm_srai_epi16(in[15], 6);
   2789 
   2790     RECON_AND_STORE(dest, in[0]);
   2791     RECON_AND_STORE(dest, in[1]);
   2792     RECON_AND_STORE(dest, in[2]);
   2793     RECON_AND_STORE(dest, in[3]);
   2794     RECON_AND_STORE(dest, in[4]);
   2795     RECON_AND_STORE(dest, in[5]);
   2796     RECON_AND_STORE(dest, in[6]);
   2797     RECON_AND_STORE(dest, in[7]);
   2798     RECON_AND_STORE(dest, in[8]);
   2799     RECON_AND_STORE(dest, in[9]);
   2800     RECON_AND_STORE(dest, in[10]);
   2801     RECON_AND_STORE(dest, in[11]);
   2802     RECON_AND_STORE(dest, in[12]);
   2803     RECON_AND_STORE(dest, in[13]);
   2804     RECON_AND_STORE(dest, in[14]);
   2805     RECON_AND_STORE(dest, in[15]);
   2806 
   2807     dest += 8 - (stride * 16);
   2808   }
   2809 }
   2810 
   2811 #define LOAD_DQCOEFF(reg, input) \
   2812   {  \
   2813     reg = _mm_load_si128((const __m128i *) input); \
   2814     input += 8; \
   2815   }  \
   2816 
   2817 #define IDCT32_34 \
   2818 /* Stage1 */ \
   2819 { \
   2820   const __m128i zero = _mm_setzero_si128();\
   2821   const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
   2822   const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
   2823   \
   2824   const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
   2825   const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
   2826   \
   2827   const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
   2828   const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
   2829   \
   2830   const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
   2831   const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
   2832   \
   2833   MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
   2834                          stg1_1, stp1_16, stp1_31); \
   2835   MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
   2836                          stg1_7, stp1_19, stp1_28); \
   2837   MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
   2838                          stg1_9, stp1_20, stp1_27); \
   2839   MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
   2840                          stg1_15, stp1_23, stp1_24); \
   2841 } \
   2842 \
   2843 /* Stage2 */ \
   2844 { \
   2845   const __m128i zero = _mm_setzero_si128();\
   2846   const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
   2847   const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
   2848   \
   2849   const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
   2850   const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
   2851   \
   2852   MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
   2853                          stg2_1, stp2_8, stp2_15); \
   2854   MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
   2855                          stg2_7, stp2_11, stp2_12); \
   2856   \
   2857   stp2_16 = stp1_16; \
   2858   stp2_19 = stp1_19; \
   2859   \
   2860   stp2_20 = stp1_20; \
   2861   stp2_23 = stp1_23; \
   2862   \
   2863   stp2_24 = stp1_24; \
   2864   stp2_27 = stp1_27; \
   2865   \
   2866   stp2_28 = stp1_28; \
   2867   stp2_31 = stp1_31; \
   2868 } \
   2869 \
   2870 /* Stage3 */ \
   2871 { \
   2872   const __m128i zero = _mm_setzero_si128();\
   2873   const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
   2874   const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
   2875   \
   2876   const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
   2877   const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
   2878   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
   2879   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
   2880   \
   2881   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
   2882   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
   2883   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
   2884   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
   2885   \
   2886   MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
   2887                          stg3_1, stp1_4, stp1_7); \
   2888   \
   2889   stp1_8 = stp2_8; \
   2890   stp1_11 = stp2_11; \
   2891   stp1_12 = stp2_12; \
   2892   stp1_15 = stp2_15; \
   2893   \
   2894   MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
   2895                          stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
   2896                          stp1_18, stp1_29) \
   2897   MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
   2898                          stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
   2899                          stp1_22, stp1_25) \
   2900   \
   2901   stp1_16 = stp2_16; \
   2902   stp1_31 = stp2_31; \
   2903   stp1_19 = stp2_19; \
   2904   stp1_20 = stp2_20; \
   2905   stp1_23 = stp2_23; \
   2906   stp1_24 = stp2_24; \
   2907   stp1_27 = stp2_27; \
   2908   stp1_28 = stp2_28; \
   2909 } \
   2910 \
   2911 /* Stage4 */ \
   2912 { \
   2913   const __m128i zero = _mm_setzero_si128();\
   2914   const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
   2915   const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
   2916   \
   2917   const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
   2918   const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
   2919   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
   2920   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
   2921   \
   2922   MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
   2923                          stg4_1, stp2_0, stp2_1); \
   2924   \
   2925   stp2_4 = stp1_4; \
   2926   stp2_5 = stp1_4; \
   2927   stp2_6 = stp1_7; \
   2928   stp2_7 = stp1_7; \
   2929   \
   2930   MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
   2931                          stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
   2932                          stp2_10, stp2_13) \
   2933   \
   2934   stp2_8 = stp1_8; \
   2935   stp2_15 = stp1_15; \
   2936   stp2_11 = stp1_11; \
   2937   stp2_12 = stp1_12; \
   2938   \
   2939   stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
   2940   stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
   2941   stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
   2942   stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
   2943   stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
   2944   stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
   2945   stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
   2946   stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
   2947   \
   2948   stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
   2949   stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
   2950   stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
   2951   stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
   2952   stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
   2953   stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
   2954   stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
   2955   stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
   2956 } \
   2957 \
   2958 /* Stage5 */ \
   2959 { \
   2960   const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
   2961   const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
   2962   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
   2963   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
   2964   \
   2965   const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
   2966   const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
   2967   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
   2968   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
   2969   \
   2970   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
   2971   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
   2972   \
   2973   stp1_0 = stp2_0; \
   2974   stp1_1 = stp2_1; \
   2975   stp1_2 = stp2_1; \
   2976   stp1_3 = stp2_0; \
   2977   \
   2978   tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
   2979   tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
   2980   tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
   2981   tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
   2982   \
   2983   tmp0 = _mm_add_epi32(tmp0, rounding); \
   2984   tmp1 = _mm_add_epi32(tmp1, rounding); \
   2985   tmp2 = _mm_add_epi32(tmp2, rounding); \
   2986   tmp3 = _mm_add_epi32(tmp3, rounding); \
   2987   \
   2988   tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
   2989   tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
   2990   tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
   2991   tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
   2992   \
   2993   stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
   2994   stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
   2995   \
   2996   stp1_4 = stp2_4; \
   2997   stp1_7 = stp2_7; \
   2998   \
   2999   stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
   3000   stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
   3001   stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
   3002   stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
   3003   stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
   3004   stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
   3005   stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
   3006   stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
   3007   \
   3008   stp1_16 = stp2_16; \
   3009   stp1_17 = stp2_17; \
   3010   \
   3011   MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
   3012                          stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
   3013                          stp1_19, stp1_28) \
   3014   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
   3015                          stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
   3016                          stp1_21, stp1_26) \
   3017   \
   3018   stp1_22 = stp2_22; \
   3019   stp1_23 = stp2_23; \
   3020   stp1_24 = stp2_24; \
   3021   stp1_25 = stp2_25; \
   3022   stp1_30 = stp2_30; \
   3023   stp1_31 = stp2_31; \
   3024 } \
   3025 \
   3026 /* Stage6 */ \
   3027 { \
   3028   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
   3029   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
   3030   const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
   3031   const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
   3032   \
   3033   stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
   3034   stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
   3035   stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
   3036   stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
   3037   stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
   3038   stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
   3039   stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
   3040   stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
   3041   \
   3042   stp2_8 = stp1_8; \
   3043   stp2_9 = stp1_9; \
   3044   stp2_14 = stp1_14; \
   3045   stp2_15 = stp1_15; \
   3046   \
   3047   MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
   3048                          stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
   3049                          stp2_13, stp2_11, stp2_12) \
   3050   \
   3051   stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
   3052   stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
   3053   stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
   3054   stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
   3055   stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
   3056   stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
   3057   stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
   3058   stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
   3059   \
   3060   stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
   3061   stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
   3062   stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
   3063   stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
   3064   stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
   3065   stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
   3066   stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
   3067   stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
   3068 } \
   3069 \
   3070 /* Stage7 */ \
   3071 { \
   3072   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
   3073   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
   3074   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
   3075   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
   3076   \
   3077   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
   3078   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
   3079   const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
   3080   const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
   3081   \
   3082   stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
   3083   stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
   3084   stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
   3085   stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
   3086   stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
   3087   stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
   3088   stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
   3089   stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
   3090   stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
   3091   stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
   3092   stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
   3093   stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
   3094   stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
   3095   stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
   3096   stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
   3097   stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
   3098   \
   3099   stp1_16 = stp2_16; \
   3100   stp1_17 = stp2_17; \
   3101   stp1_18 = stp2_18; \
   3102   stp1_19 = stp2_19; \
   3103   \
   3104   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
   3105                          stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
   3106                          stp1_21, stp1_26) \
   3107   MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
   3108                          stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
   3109                          stp1_23, stp1_24) \
   3110   \
   3111   stp1_28 = stp2_28; \
   3112   stp1_29 = stp2_29; \
   3113   stp1_30 = stp2_30; \
   3114   stp1_31 = stp2_31; \
   3115 }
   3116 
   3117 
   3118 #define IDCT32 \
   3119 /* Stage1 */ \
   3120 { \
   3121   const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
   3122   const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
   3123   const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
   3124   const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
   3125   \
   3126   const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
   3127   const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
   3128   const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
   3129   const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
   3130   \
   3131   const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
   3132   const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
   3133   const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
   3134   const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
   3135   \
   3136   const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
   3137   const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
   3138   const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
   3139   const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
   3140   \
   3141   MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
   3142                          stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
   3143                          stp1_17, stp1_30) \
   3144   MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
   3145                          stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
   3146                          stp1_19, stp1_28) \
   3147   MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
   3148                          stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
   3149                          stp1_21, stp1_26) \
   3150   MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
   3151                          stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
   3152                          stp1_23, stp1_24) \
   3153 } \
   3154 \
   3155 /* Stage2 */ \
   3156 { \
   3157   const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
   3158   const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
   3159   const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
   3160   const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
   3161   \
   3162   const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
   3163   const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
   3164   const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
   3165   const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
   3166   \
   3167   MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
   3168                          stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
   3169                          stp2_14) \
   3170   MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
   3171                          stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
   3172                          stp2_11, stp2_12) \
   3173   \
   3174   stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
   3175   stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
   3176   stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
   3177   stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
   3178   \
   3179   stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
   3180   stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
   3181   stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
   3182   stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
   3183   \
   3184   stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
   3185   stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
   3186   stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
   3187   stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
   3188   \
   3189   stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
   3190   stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
   3191   stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
   3192   stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
   3193 } \
   3194 \
   3195 /* Stage3 */ \
   3196 { \
   3197   const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
   3198   const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
   3199   const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
   3200   const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
   3201   \
   3202   const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
   3203   const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
   3204   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
   3205   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
   3206   \
   3207   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
   3208   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
   3209   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
   3210   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
   3211   \
   3212   MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
   3213                          stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
   3214                          stp1_6) \
   3215   \
   3216   stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
   3217   stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
   3218   stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
   3219   stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
   3220   stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
   3221   stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
   3222   stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
   3223   stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
   3224   \
   3225   MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
   3226                          stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
   3227                          stp1_18, stp1_29) \
   3228   MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
   3229                          stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
   3230                          stp1_22, stp1_25) \
   3231   \
   3232   stp1_16 = stp2_16; \
   3233   stp1_31 = stp2_31; \
   3234   stp1_19 = stp2_19; \
   3235   stp1_20 = stp2_20; \
   3236   stp1_23 = stp2_23; \
   3237   stp1_24 = stp2_24; \
   3238   stp1_27 = stp2_27; \
   3239   stp1_28 = stp2_28; \
   3240 } \
   3241 \
   3242 /* Stage4 */ \
   3243 { \
   3244   const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
   3245   const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
   3246   const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
   3247   const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
   3248   \
   3249   const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
   3250   const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
   3251   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
   3252   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
   3253   \
   3254   MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
   3255                          stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
   3256                          stp2_2, stp2_3) \
   3257   \
   3258   stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
   3259   stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
   3260   stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
   3261   stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
   3262   \
   3263   MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
   3264                          stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
   3265                          stp2_10, stp2_13) \
   3266   \
   3267   stp2_8 = stp1_8; \
   3268   stp2_15 = stp1_15; \
   3269   stp2_11 = stp1_11; \
   3270   stp2_12 = stp1_12; \
   3271   \
   3272   stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
   3273   stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
   3274   stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
   3275   stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
   3276   stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
   3277   stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
   3278   stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
   3279   stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
   3280   \
   3281   stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
   3282   stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
   3283   stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
   3284   stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
   3285   stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
   3286   stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
   3287   stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
   3288   stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
   3289 } \
   3290 \
   3291 /* Stage5 */ \
   3292 { \
   3293   const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
   3294   const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
   3295   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
   3296   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
   3297   \
   3298   const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
   3299   const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
   3300   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
   3301   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
   3302   \
   3303   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
   3304   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
   3305   \
   3306   stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
   3307   stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
   3308   stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
   3309   stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
   3310   \
   3311   tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
   3312   tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
   3313   tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
   3314   tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
   3315   \
   3316   tmp0 = _mm_add_epi32(tmp0, rounding); \
   3317   tmp1 = _mm_add_epi32(tmp1, rounding); \
   3318   tmp2 = _mm_add_epi32(tmp2, rounding); \
   3319   tmp3 = _mm_add_epi32(tmp3, rounding); \
   3320   \
   3321   tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
   3322   tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
   3323   tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
   3324   tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
   3325   \
   3326   stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
   3327   stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
   3328   \
   3329   stp1_4 = stp2_4; \
   3330   stp1_7 = stp2_7; \
   3331   \
   3332   stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
   3333   stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
   3334   stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
   3335   stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
   3336   stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
   3337   stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
   3338   stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
   3339   stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
   3340   \
   3341   stp1_16 = stp2_16; \
   3342   stp1_17 = stp2_17; \
   3343   \
   3344   MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
   3345                          stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
   3346                          stp1_19, stp1_28) \
   3347   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
   3348                          stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
   3349                          stp1_21, stp1_26) \
   3350   \
   3351   stp1_22 = stp2_22; \
   3352   stp1_23 = stp2_23; \
   3353   stp1_24 = stp2_24; \
   3354   stp1_25 = stp2_25; \
   3355   stp1_30 = stp2_30; \
   3356   stp1_31 = stp2_31; \
   3357 } \
   3358 \
   3359 /* Stage6 */ \
   3360 { \
   3361   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
   3362   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
   3363   const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
   3364   const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
   3365   \
   3366   stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
   3367   stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
   3368   stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
   3369   stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
   3370   stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
   3371   stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
   3372   stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
   3373   stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
   3374   \
   3375   stp2_8 = stp1_8; \
   3376   stp2_9 = stp1_9; \
   3377   stp2_14 = stp1_14; \
   3378   stp2_15 = stp1_15; \
   3379   \
   3380   MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
   3381                          stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
   3382                          stp2_13, stp2_11, stp2_12) \
   3383   \
   3384   stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
   3385   stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
   3386   stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
   3387   stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
   3388   stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
   3389   stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
   3390   stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
   3391   stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
   3392   \
   3393   stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
   3394   stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
   3395   stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
   3396   stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
   3397   stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
   3398   stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
   3399   stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
   3400   stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
   3401 } \
   3402 \
   3403 /* Stage7 */ \
   3404 { \
   3405   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
   3406   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
   3407   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
   3408   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
   3409   \
   3410   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
   3411   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
   3412   const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
   3413   const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
   3414   \
   3415   stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
   3416   stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
   3417   stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
   3418   stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
   3419   stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
   3420   stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
   3421   stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
   3422   stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
   3423   stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
   3424   stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
   3425   stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
   3426   stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
   3427   stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
   3428   stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
   3429   stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
   3430   stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
   3431   \
   3432   stp1_16 = stp2_16; \
   3433   stp1_17 = stp2_17; \
   3434   stp1_18 = stp2_18; \
   3435   stp1_19 = stp2_19; \
   3436   \
   3437   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
   3438                          stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
   3439                          stp1_21, stp1_26) \
   3440   MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
   3441                          stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
   3442                          stp1_23, stp1_24) \
   3443   \
   3444   stp1_28 = stp2_28; \
   3445   stp1_29 = stp2_29; \
   3446   stp1_30 = stp2_30; \
   3447   stp1_31 = stp2_31; \
   3448 }
   3449 
// Only upper-left 8x8 has non-zero coeff.
//
// 2-D inverse 32x32 DCT specialized for the case where at most the 34
// lowest-frequency coefficients are non-zero (all of them lie in the
// top-left 8x8 corner of the coefficient block).  The reconstructed
// residual is added to the prediction already in |dest| (row pitch
// |stride|) with unsigned-saturating 8-bit stores.
//
// Structure: one row pass over the 8 loaded coefficient rows, the 32
// intermediate result rows are kept in |col|, then four column passes
// (one per 8-pixel-wide strip) each followed by round/shift/reconstruct.
void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
                                 int stride) {
  // Rounding term for the fixed-point butterfly multiplies inside the
  // IDCT32/IDCT32_34 macros (consumed by name in their expansions).
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Bias of 1 << 5 so the final arithmetic right shift by 6 rounds to nearest.
  const __m128i final_rounding = _mm_set1_epi16(1<<5);

  // idct constants for each stage.  Each stgN_M packs a (cos, sin)-style
  // pair of cospi constants for one butterfly of stage N; they are
  // referenced by name inside the IDCT32/IDCT32_34 macro expansions below.
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  // in:  8x32 working block of one transform pass (8 int16 lanes each).
  // col: the 32 intermediate rows produced by the row pass.
  __m128i in[32], col[32];
  // stp1_*/stp2_*: per-stage butterfly outputs; written and read by name
  // inside the IDCT32/IDCT32_34 macro expansions.
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
          stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
          stp2_30, stp2_31;
  // Scratch registers used by the transform macros.
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // Load input data.  LOAD_DQCOEFF presumably reads 8 coefficients and
  // advances |input| (macro defined earlier in this file) — so these 32
  // loads consume the first 8 rows of the 32-wide coefficient block, with
  // row r's four 8-lane groups landing in in[r], in[r+8], in[r+16],
  // in[r+24] so that each 8x8 quarter is contiguous for the transposes.
  LOAD_DQCOEFF(in[0], input);
  LOAD_DQCOEFF(in[8], input);
  LOAD_DQCOEFF(in[16], input);
  LOAD_DQCOEFF(in[24], input);
  LOAD_DQCOEFF(in[1], input);
  LOAD_DQCOEFF(in[9], input);
  LOAD_DQCOEFF(in[17], input);
  LOAD_DQCOEFF(in[25], input);
  LOAD_DQCOEFF(in[2], input);
  LOAD_DQCOEFF(in[10], input);
  LOAD_DQCOEFF(in[18], input);
  LOAD_DQCOEFF(in[26], input);
  LOAD_DQCOEFF(in[3], input);
  LOAD_DQCOEFF(in[11], input);
  LOAD_DQCOEFF(in[19], input);
  LOAD_DQCOEFF(in[27], input);

  LOAD_DQCOEFF(in[4], input);
  LOAD_DQCOEFF(in[12], input);
  LOAD_DQCOEFF(in[20], input);
  LOAD_DQCOEFF(in[28], input);
  LOAD_DQCOEFF(in[5], input);
  LOAD_DQCOEFF(in[13], input);
  LOAD_DQCOEFF(in[21], input);
  LOAD_DQCOEFF(in[29], input);
  LOAD_DQCOEFF(in[6], input);
  LOAD_DQCOEFF(in[14], input);
  LOAD_DQCOEFF(in[22], input);
  LOAD_DQCOEFF(in[30], input);
  LOAD_DQCOEFF(in[7], input);
  LOAD_DQCOEFF(in[15], input);
  LOAD_DQCOEFF(in[23], input);
  LOAD_DQCOEFF(in[31], input);

  // Transpose each 8x8 quarter in place so the row pass can operate on
  // columns-as-lanes.
  array_transpose_8x8(in, in);
  array_transpose_8x8(in+8, in+8);
  array_transpose_8x8(in+16, in+16);
  array_transpose_8x8(in+24, in+24);

  // Row pass: full 32-point 1-D idct (stages 1..7 via the IDCT32 macro;
  // the final stage-8 butterflies are written out below into |col|).
  IDCT32

  // 1_D: Store 32 intermediate results for each 8x32 block.
  // This is the last butterfly stage: outputs 0..15 are sums, 16..31 the
  // mirrored differences.
  col[0] = _mm_add_epi16(stp1_0, stp1_31);
  col[1] = _mm_add_epi16(stp1_1, stp1_30);
  col[2] = _mm_add_epi16(stp1_2, stp1_29);
  col[3] = _mm_add_epi16(stp1_3, stp1_28);
  col[4] = _mm_add_epi16(stp1_4, stp1_27);
  col[5] = _mm_add_epi16(stp1_5, stp1_26);
  col[6] = _mm_add_epi16(stp1_6, stp1_25);
  col[7] = _mm_add_epi16(stp1_7, stp1_24);
  col[8] = _mm_add_epi16(stp1_8, stp1_23);
  col[9] = _mm_add_epi16(stp1_9, stp1_22);
  col[10] = _mm_add_epi16(stp1_10, stp1_21);
  col[11] = _mm_add_epi16(stp1_11, stp1_20);
  col[12] = _mm_add_epi16(stp1_12, stp1_19);
  col[13] = _mm_add_epi16(stp1_13, stp1_18);
  col[14] = _mm_add_epi16(stp1_14, stp1_17);
  col[15] = _mm_add_epi16(stp1_15, stp1_16);
  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
  // Column pass: one iteration per 8-pixel-wide strip of the output.
  for (i = 0; i < 4; i++) {
      // |zero| is read by name inside the RECON_AND_STORE macro.
      const __m128i zero = _mm_setzero_si128();
      // Transpose 32x8 block to 8x32 block.  NOTE(review): only the first
      // 8 rows of this strip are transposed into in[0..7]; IDCT32_34
      // presumably reads just those (only 8 intermediate rows can be
      // non-zero in the 34-coefficient case) — confirm against the macro.
      array_transpose_8x8(col+i*8, in);
      IDCT32_34

      // 2_D: Calculate the results and store them to destination.
      // Same final butterfly stage as above, now on the column transform.
      in[0] = _mm_add_epi16(stp1_0, stp1_31);
      in[1] = _mm_add_epi16(stp1_1, stp1_30);
      in[2] = _mm_add_epi16(stp1_2, stp1_29);
      in[3] = _mm_add_epi16(stp1_3, stp1_28);
      in[4] = _mm_add_epi16(stp1_4, stp1_27);
      in[5] = _mm_add_epi16(stp1_5, stp1_26);
      in[6] = _mm_add_epi16(stp1_6, stp1_25);
      in[7] = _mm_add_epi16(stp1_7, stp1_24);
      in[8] = _mm_add_epi16(stp1_8, stp1_23);
      in[9] = _mm_add_epi16(stp1_9, stp1_22);
      in[10] = _mm_add_epi16(stp1_10, stp1_21);
      in[11] = _mm_add_epi16(stp1_11, stp1_20);
      in[12] = _mm_add_epi16(stp1_12, stp1_19);
      in[13] = _mm_add_epi16(stp1_13, stp1_18);
      in[14] = _mm_add_epi16(stp1_14, stp1_17);
      in[15] = _mm_add_epi16(stp1_15, stp1_16);
      in[16] = _mm_sub_epi16(stp1_15, stp1_16);
      in[17] = _mm_sub_epi16(stp1_14, stp1_17);
      in[18] = _mm_sub_epi16(stp1_13, stp1_18);
      in[19] = _mm_sub_epi16(stp1_12, stp1_19);
      in[20] = _mm_sub_epi16(stp1_11, stp1_20);
      in[21] = _mm_sub_epi16(stp1_10, stp1_21);
      in[22] = _mm_sub_epi16(stp1_9, stp1_22);
      in[23] = _mm_sub_epi16(stp1_8, stp1_23);
      in[24] = _mm_sub_epi16(stp1_7, stp1_24);
      in[25] = _mm_sub_epi16(stp1_6, stp1_25);
      in[26] = _mm_sub_epi16(stp1_5, stp1_26);
      in[27] = _mm_sub_epi16(stp1_4, stp1_27);
      in[28] = _mm_sub_epi16(stp1_3, stp1_28);
      in[29] = _mm_sub_epi16(stp1_2, stp1_29);
      in[30] = _mm_sub_epi16(stp1_1, stp1_30);
      in[31] = _mm_sub_epi16(stp1_0, stp1_31);

      // Final rounding and shift: add 1 << 5 with signed saturation, then
      // arithmetic shift right by 6 (the transform's final down-scale).
      in[0] = _mm_adds_epi16(in[0], final_rounding);
      in[1] = _mm_adds_epi16(in[1], final_rounding);
      in[2] = _mm_adds_epi16(in[2], final_rounding);
      in[3] = _mm_adds_epi16(in[3], final_rounding);
      in[4] = _mm_adds_epi16(in[4], final_rounding);
      in[5] = _mm_adds_epi16(in[5], final_rounding);
      in[6] = _mm_adds_epi16(in[6], final_rounding);
      in[7] = _mm_adds_epi16(in[7], final_rounding);
      in[8] = _mm_adds_epi16(in[8], final_rounding);
      in[9] = _mm_adds_epi16(in[9], final_rounding);
      in[10] = _mm_adds_epi16(in[10], final_rounding);
      in[11] = _mm_adds_epi16(in[11], final_rounding);
      in[12] = _mm_adds_epi16(in[12], final_rounding);
      in[13] = _mm_adds_epi16(in[13], final_rounding);
      in[14] = _mm_adds_epi16(in[14], final_rounding);
      in[15] = _mm_adds_epi16(in[15], final_rounding);
      in[16] = _mm_adds_epi16(in[16], final_rounding);
      in[17] = _mm_adds_epi16(in[17], final_rounding);
      in[18] = _mm_adds_epi16(in[18], final_rounding);
      in[19] = _mm_adds_epi16(in[19], final_rounding);
      in[20] = _mm_adds_epi16(in[20], final_rounding);
      in[21] = _mm_adds_epi16(in[21], final_rounding);
      in[22] = _mm_adds_epi16(in[22], final_rounding);
      in[23] = _mm_adds_epi16(in[23], final_rounding);
      in[24] = _mm_adds_epi16(in[24], final_rounding);
      in[25] = _mm_adds_epi16(in[25], final_rounding);
      in[26] = _mm_adds_epi16(in[26], final_rounding);
      in[27] = _mm_adds_epi16(in[27], final_rounding);
      in[28] = _mm_adds_epi16(in[28], final_rounding);
      in[29] = _mm_adds_epi16(in[29], final_rounding);
      in[30] = _mm_adds_epi16(in[30], final_rounding);
      in[31] = _mm_adds_epi16(in[31], final_rounding);

      in[0] = _mm_srai_epi16(in[0], 6);
      in[1] = _mm_srai_epi16(in[1], 6);
      in[2] = _mm_srai_epi16(in[2], 6);
      in[3] = _mm_srai_epi16(in[3], 6);
      in[4] = _mm_srai_epi16(in[4], 6);
      in[5] = _mm_srai_epi16(in[5], 6);
      in[6] = _mm_srai_epi16(in[6], 6);
      in[7] = _mm_srai_epi16(in[7], 6);
      in[8] = _mm_srai_epi16(in[8], 6);
      in[9] = _mm_srai_epi16(in[9], 6);
      in[10] = _mm_srai_epi16(in[10], 6);
      in[11] = _mm_srai_epi16(in[11], 6);
      in[12] = _mm_srai_epi16(in[12], 6);
      in[13] = _mm_srai_epi16(in[13], 6);
      in[14] = _mm_srai_epi16(in[14], 6);
      in[15] = _mm_srai_epi16(in[15], 6);
      in[16] = _mm_srai_epi16(in[16], 6);
      in[17] = _mm_srai_epi16(in[17], 6);
      in[18] = _mm_srai_epi16(in[18], 6);
      in[19] = _mm_srai_epi16(in[19], 6);
      in[20] = _mm_srai_epi16(in[20], 6);
      in[21] = _mm_srai_epi16(in[21], 6);
      in[22] = _mm_srai_epi16(in[22], 6);
      in[23] = _mm_srai_epi16(in[23], 6);
      in[24] = _mm_srai_epi16(in[24], 6);
      in[25] = _mm_srai_epi16(in[25], 6);
      in[26] = _mm_srai_epi16(in[26], 6);
      in[27] = _mm_srai_epi16(in[27], 6);
      in[28] = _mm_srai_epi16(in[28], 6);
      in[29] = _mm_srai_epi16(in[29], 6);
      in[30] = _mm_srai_epi16(in[30], 6);
      in[31] = _mm_srai_epi16(in[31], 6);

      // Add each residual row to the prediction and store 8 pixels;
      // RECON_AND_STORE presumably advances |dest| by |stride| per call
      // (macro defined earlier in this file).
      RECON_AND_STORE(dest, in[0]);
      RECON_AND_STORE(dest, in[1]);
      RECON_AND_STORE(dest, in[2]);
      RECON_AND_STORE(dest, in[3]);
      RECON_AND_STORE(dest, in[4]);
      RECON_AND_STORE(dest, in[5]);
      RECON_AND_STORE(dest, in[6]);
      RECON_AND_STORE(dest, in[7]);
      RECON_AND_STORE(dest, in[8]);
      RECON_AND_STORE(dest, in[9]);
      RECON_AND_STORE(dest, in[10]);
      RECON_AND_STORE(dest, in[11]);
      RECON_AND_STORE(dest, in[12]);
      RECON_AND_STORE(dest, in[13]);
      RECON_AND_STORE(dest, in[14]);
      RECON_AND_STORE(dest, in[15]);
      RECON_AND_STORE(dest, in[16]);
      RECON_AND_STORE(dest, in[17]);
      RECON_AND_STORE(dest, in[18]);
      RECON_AND_STORE(dest, in[19]);
      RECON_AND_STORE(dest, in[20]);
      RECON_AND_STORE(dest, in[21]);
      RECON_AND_STORE(dest, in[22]);
      RECON_AND_STORE(dest, in[23]);
      RECON_AND_STORE(dest, in[24]);
      RECON_AND_STORE(dest, in[25]);
      RECON_AND_STORE(dest, in[26]);
      RECON_AND_STORE(dest, in[27]);
      RECON_AND_STORE(dest, in[28]);
      RECON_AND_STORE(dest, in[29]);
      RECON_AND_STORE(dest, in[30]);
      RECON_AND_STORE(dest, in[31]);

      // Step 8 pixels right and back up 32 rows for the next strip.
      dest += 8 - (stride * 32);
    }
  }
   3735 
   3736 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
   3737                                  int stride) {
   3738   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   3739   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   3740 
   3741   // idct constants for each stage
   3742   const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
   3743   const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
   3744   const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
   3745   const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
   3746   const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
   3747   const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
   3748   const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
   3749   const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
   3750   const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
   3751   const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
   3752   const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
   3753   const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
   3754   const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
   3755   const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
   3756   const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
   3757   const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
   3758 
   3759   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   3760   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
   3761   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
   3762   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
   3763   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
   3764   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
   3765   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
   3766   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
   3767 
   3768   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
   3769   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
   3770   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
   3771   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
   3772   const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
   3773   const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
   3774   const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
   3775   const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
   3776   const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
   3777   const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
   3778 
   3779   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
   3780   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   3781   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   3782   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
   3783   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   3784   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
   3785   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
   3786 
   3787   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   3788 
   3789   __m128i in[32], col[128], zero_idx[16];
   3790   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
   3791           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
   3792           stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
   3793           stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
   3794           stp1_30, stp1_31;
   3795   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
   3796           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
   3797           stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
   3798           stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
   3799           stp2_30, stp2_31;
   3800   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   3801   int i, j, i32;
   3802   int zero_flag[2];
   3803 
   3804   for (i = 0; i < 4; i++) {
   3805     i32 = (i << 5);
   3806       // First 1-D idct
   3807       // Load input data.
   3808       LOAD_DQCOEFF(in[0], input);
   3809       LOAD_DQCOEFF(in[8], input);
   3810       LOAD_DQCOEFF(in[16], input);
   3811       LOAD_DQCOEFF(in[24], input);
   3812       LOAD_DQCOEFF(in[1], input);
   3813       LOAD_DQCOEFF(in[9], input);
   3814       LOAD_DQCOEFF(in[17], input);
   3815       LOAD_DQCOEFF(in[25], input);
   3816       LOAD_DQCOEFF(in[2], input);
   3817       LOAD_DQCOEFF(in[10], input);
   3818       LOAD_DQCOEFF(in[18], input);
   3819       LOAD_DQCOEFF(in[26], input);
   3820       LOAD_DQCOEFF(in[3], input);
   3821       LOAD_DQCOEFF(in[11], input);
   3822       LOAD_DQCOEFF(in[19], input);
   3823       LOAD_DQCOEFF(in[27], input);
   3824 
   3825       LOAD_DQCOEFF(in[4], input);
   3826       LOAD_DQCOEFF(in[12], input);
   3827       LOAD_DQCOEFF(in[20], input);
   3828       LOAD_DQCOEFF(in[28], input);
   3829       LOAD_DQCOEFF(in[5], input);
   3830       LOAD_DQCOEFF(in[13], input);
   3831       LOAD_DQCOEFF(in[21], input);
   3832       LOAD_DQCOEFF(in[29], input);
   3833       LOAD_DQCOEFF(in[6], input);
   3834       LOAD_DQCOEFF(in[14], input);
   3835       LOAD_DQCOEFF(in[22], input);
   3836       LOAD_DQCOEFF(in[30], input);
   3837       LOAD_DQCOEFF(in[7], input);
   3838       LOAD_DQCOEFF(in[15], input);
   3839       LOAD_DQCOEFF(in[23], input);
   3840       LOAD_DQCOEFF(in[31], input);
   3841 
   3842       // checking if all entries are zero
   3843       zero_idx[0] = _mm_or_si128(in[0], in[1]);
   3844       zero_idx[1] = _mm_or_si128(in[2], in[3]);
   3845       zero_idx[2] = _mm_or_si128(in[4], in[5]);
   3846       zero_idx[3] = _mm_or_si128(in[6], in[7]);
   3847       zero_idx[4] = _mm_or_si128(in[8], in[9]);
   3848       zero_idx[5] = _mm_or_si128(in[10], in[11]);
   3849       zero_idx[6] = _mm_or_si128(in[12], in[13]);
   3850       zero_idx[7] = _mm_or_si128(in[14], in[15]);
   3851       zero_idx[8] = _mm_or_si128(in[16], in[17]);
   3852       zero_idx[9] = _mm_or_si128(in[18], in[19]);
   3853       zero_idx[10] = _mm_or_si128(in[20], in[21]);
   3854       zero_idx[11] = _mm_or_si128(in[22], in[23]);
   3855       zero_idx[12] = _mm_or_si128(in[24], in[25]);
   3856       zero_idx[13] = _mm_or_si128(in[26], in[27]);
   3857       zero_idx[14] = _mm_or_si128(in[28], in[29]);
   3858       zero_idx[15] = _mm_or_si128(in[30], in[31]);
   3859 
   3860       zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
   3861       zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
   3862       zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
   3863       zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
   3864       zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
   3865       zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
   3866       zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
   3867       zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
   3868 
   3869       zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
   3870       zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
   3871       zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
   3872       zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
   3873       zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
   3874       zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
   3875       zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
   3876 
   3877       zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
   3878       zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
   3879       zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
   3880       zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
   3881       zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
   3882 
   3883       if (!zero_flag[0] && !zero_flag[1]) {
   3884         col[i32 + 0] = _mm_setzero_si128();
   3885         col[i32 + 1] = _mm_setzero_si128();
   3886         col[i32 + 2] = _mm_setzero_si128();
   3887         col[i32 + 3] = _mm_setzero_si128();
   3888         col[i32 + 4] = _mm_setzero_si128();
   3889         col[i32 + 5] = _mm_setzero_si128();
   3890         col[i32 + 6] = _mm_setzero_si128();
   3891         col[i32 + 7] = _mm_setzero_si128();
   3892         col[i32 + 8] = _mm_setzero_si128();
   3893         col[i32 + 9] = _mm_setzero_si128();
   3894         col[i32 + 10] = _mm_setzero_si128();
   3895         col[i32 + 11] = _mm_setzero_si128();
   3896         col[i32 + 12] = _mm_setzero_si128();
   3897         col[i32 + 13] = _mm_setzero_si128();
   3898         col[i32 + 14] = _mm_setzero_si128();
   3899         col[i32 + 15] = _mm_setzero_si128();
   3900         col[i32 + 16] = _mm_setzero_si128();
   3901         col[i32 + 17] = _mm_setzero_si128();
   3902         col[i32 + 18] = _mm_setzero_si128();
   3903         col[i32 + 19] = _mm_setzero_si128();
   3904         col[i32 + 20] = _mm_setzero_si128();
   3905         col[i32 + 21] = _mm_setzero_si128();
   3906         col[i32 + 22] = _mm_setzero_si128();
   3907         col[i32 + 23] = _mm_setzero_si128();
   3908         col[i32 + 24] = _mm_setzero_si128();
   3909         col[i32 + 25] = _mm_setzero_si128();
   3910         col[i32 + 26] = _mm_setzero_si128();
   3911         col[i32 + 27] = _mm_setzero_si128();
   3912         col[i32 + 28] = _mm_setzero_si128();
   3913         col[i32 + 29] = _mm_setzero_si128();
   3914         col[i32 + 30] = _mm_setzero_si128();
   3915         col[i32 + 31] = _mm_setzero_si128();
   3916         continue;
   3917       }
   3918 
   3919       // Transpose 32x8 block to 8x32 block
   3920       array_transpose_8x8(in, in);
   3921       array_transpose_8x8(in+8, in+8);
   3922       array_transpose_8x8(in+16, in+16);
   3923       array_transpose_8x8(in+24, in+24);
   3924 
   3925       IDCT32
   3926 
   3927       // 1_D: Store 32 intermediate results for each 8x32 block.
   3928       col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
   3929       col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
   3930       col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
   3931       col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
   3932       col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
   3933       col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
   3934       col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
   3935       col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
   3936       col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
   3937       col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
   3938       col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
   3939       col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
   3940       col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
   3941       col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
   3942       col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
   3943       col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
   3944       col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
   3945       col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
   3946       col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
   3947       col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
   3948       col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
   3949       col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
   3950       col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
   3951       col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
   3952       col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
   3953       col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
   3954       col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
   3955       col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
   3956       col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
   3957       col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
   3958       col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
   3959       col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
   3960     }
   3961   for (i = 0; i < 4; i++) {
   3962       const __m128i zero = _mm_setzero_si128();
   3963       // Second 1-D idct
   3964       j = i << 3;
   3965 
   3966       // Transpose 32x8 block to 8x32 block
   3967       array_transpose_8x8(col+j, in);
   3968       array_transpose_8x8(col+j+32, in+8);
   3969       array_transpose_8x8(col+j+64, in+16);
   3970       array_transpose_8x8(col+j+96, in+24);
   3971 
   3972       IDCT32
   3973 
   3974       // 2_D: Calculate the results and store them to destination.
   3975       in[0] = _mm_add_epi16(stp1_0, stp1_31);
   3976       in[1] = _mm_add_epi16(stp1_1, stp1_30);
   3977       in[2] = _mm_add_epi16(stp1_2, stp1_29);
   3978       in[3] = _mm_add_epi16(stp1_3, stp1_28);
   3979       in[4] = _mm_add_epi16(stp1_4, stp1_27);
   3980       in[5] = _mm_add_epi16(stp1_5, stp1_26);
   3981       in[6] = _mm_add_epi16(stp1_6, stp1_25);
   3982       in[7] = _mm_add_epi16(stp1_7, stp1_24);
   3983       in[8] = _mm_add_epi16(stp1_8, stp1_23);
   3984       in[9] = _mm_add_epi16(stp1_9, stp1_22);
   3985       in[10] = _mm_add_epi16(stp1_10, stp1_21);
   3986       in[11] = _mm_add_epi16(stp1_11, stp1_20);
   3987       in[12] = _mm_add_epi16(stp1_12, stp1_19);
   3988       in[13] = _mm_add_epi16(stp1_13, stp1_18);
   3989       in[14] = _mm_add_epi16(stp1_14, stp1_17);
   3990       in[15] = _mm_add_epi16(stp1_15, stp1_16);
   3991       in[16] = _mm_sub_epi16(stp1_15, stp1_16);
   3992       in[17] = _mm_sub_epi16(stp1_14, stp1_17);
   3993       in[18] = _mm_sub_epi16(stp1_13, stp1_18);
   3994       in[19] = _mm_sub_epi16(stp1_12, stp1_19);
   3995       in[20] = _mm_sub_epi16(stp1_11, stp1_20);
   3996       in[21] = _mm_sub_epi16(stp1_10, stp1_21);
   3997       in[22] = _mm_sub_epi16(stp1_9, stp1_22);
   3998       in[23] = _mm_sub_epi16(stp1_8, stp1_23);
   3999       in[24] = _mm_sub_epi16(stp1_7, stp1_24);
   4000       in[25] = _mm_sub_epi16(stp1_6, stp1_25);
   4001       in[26] = _mm_sub_epi16(stp1_5, stp1_26);
   4002       in[27] = _mm_sub_epi16(stp1_4, stp1_27);
   4003       in[28] = _mm_sub_epi16(stp1_3, stp1_28);
   4004       in[29] = _mm_sub_epi16(stp1_2, stp1_29);
   4005       in[30] = _mm_sub_epi16(stp1_1, stp1_30);
   4006       in[31] = _mm_sub_epi16(stp1_0, stp1_31);
   4007 
   4008       // Final rounding and shift
   4009       in[0] = _mm_adds_epi16(in[0], final_rounding);
   4010       in[1] = _mm_adds_epi16(in[1], final_rounding);
   4011       in[2] = _mm_adds_epi16(in[2], final_rounding);
   4012       in[3] = _mm_adds_epi16(in[3], final_rounding);
   4013       in[4] = _mm_adds_epi16(in[4], final_rounding);
   4014       in[5] = _mm_adds_epi16(in[5], final_rounding);
   4015       in[6] = _mm_adds_epi16(in[6], final_rounding);
   4016       in[7] = _mm_adds_epi16(in[7], final_rounding);
   4017       in[8] = _mm_adds_epi16(in[8], final_rounding);
   4018       in[9] = _mm_adds_epi16(in[9], final_rounding);
   4019       in[10] = _mm_adds_epi16(in[10], final_rounding);
   4020       in[11] = _mm_adds_epi16(in[11], final_rounding);
   4021       in[12] = _mm_adds_epi16(in[12], final_rounding);
   4022       in[13] = _mm_adds_epi16(in[13], final_rounding);
   4023       in[14] = _mm_adds_epi16(in[14], final_rounding);
   4024       in[15] = _mm_adds_epi16(in[15], final_rounding);
   4025       in[16] = _mm_adds_epi16(in[16], final_rounding);
   4026       in[17] = _mm_adds_epi16(in[17], final_rounding);
   4027       in[18] = _mm_adds_epi16(in[18], final_rounding);
   4028       in[19] = _mm_adds_epi16(in[19], final_rounding);
   4029       in[20] = _mm_adds_epi16(in[20], final_rounding);
   4030       in[21] = _mm_adds_epi16(in[21], final_rounding);
   4031       in[22] = _mm_adds_epi16(in[22], final_rounding);
   4032       in[23] = _mm_adds_epi16(in[23], final_rounding);
   4033       in[24] = _mm_adds_epi16(in[24], final_rounding);
   4034       in[25] = _mm_adds_epi16(in[25], final_rounding);
   4035       in[26] = _mm_adds_epi16(in[26], final_rounding);
   4036       in[27] = _mm_adds_epi16(in[27], final_rounding);
   4037       in[28] = _mm_adds_epi16(in[28], final_rounding);
   4038       in[29] = _mm_adds_epi16(in[29], final_rounding);
   4039       in[30] = _mm_adds_epi16(in[30], final_rounding);
   4040       in[31] = _mm_adds_epi16(in[31], final_rounding);
   4041 
   4042       in[0] = _mm_srai_epi16(in[0], 6);
   4043       in[1] = _mm_srai_epi16(in[1], 6);
   4044       in[2] = _mm_srai_epi16(in[2], 6);
   4045       in[3] = _mm_srai_epi16(in[3], 6);
   4046       in[4] = _mm_srai_epi16(in[4], 6);
   4047       in[5] = _mm_srai_epi16(in[5], 6);
   4048       in[6] = _mm_srai_epi16(in[6], 6);
   4049       in[7] = _mm_srai_epi16(in[7], 6);
   4050       in[8] = _mm_srai_epi16(in[8], 6);
   4051       in[9] = _mm_srai_epi16(in[9], 6);
   4052       in[10] = _mm_srai_epi16(in[10], 6);
   4053       in[11] = _mm_srai_epi16(in[11], 6);
   4054       in[12] = _mm_srai_epi16(in[12], 6);
   4055       in[13] = _mm_srai_epi16(in[13], 6);
   4056       in[14] = _mm_srai_epi16(in[14], 6);
   4057       in[15] = _mm_srai_epi16(in[15], 6);
   4058       in[16] = _mm_srai_epi16(in[16], 6);
   4059       in[17] = _mm_srai_epi16(in[17], 6);
   4060       in[18] = _mm_srai_epi16(in[18], 6);
   4061       in[19] = _mm_srai_epi16(in[19], 6);
   4062       in[20] = _mm_srai_epi16(in[20], 6);
   4063       in[21] = _mm_srai_epi16(in[21], 6);
   4064       in[22] = _mm_srai_epi16(in[22], 6);
   4065       in[23] = _mm_srai_epi16(in[23], 6);
   4066       in[24] = _mm_srai_epi16(in[24], 6);
   4067       in[25] = _mm_srai_epi16(in[25], 6);
   4068       in[26] = _mm_srai_epi16(in[26], 6);
   4069       in[27] = _mm_srai_epi16(in[27], 6);
   4070       in[28] = _mm_srai_epi16(in[28], 6);
   4071       in[29] = _mm_srai_epi16(in[29], 6);
   4072       in[30] = _mm_srai_epi16(in[30], 6);
   4073       in[31] = _mm_srai_epi16(in[31], 6);
   4074 
   4075       RECON_AND_STORE(dest, in[0]);
   4076       RECON_AND_STORE(dest, in[1]);
   4077       RECON_AND_STORE(dest, in[2]);
   4078       RECON_AND_STORE(dest, in[3]);
   4079       RECON_AND_STORE(dest, in[4]);
   4080       RECON_AND_STORE(dest, in[5]);
   4081       RECON_AND_STORE(dest, in[6]);
   4082       RECON_AND_STORE(dest, in[7]);
   4083       RECON_AND_STORE(dest, in[8]);
   4084       RECON_AND_STORE(dest, in[9]);
   4085       RECON_AND_STORE(dest, in[10]);
   4086       RECON_AND_STORE(dest, in[11]);
   4087       RECON_AND_STORE(dest, in[12]);
   4088       RECON_AND_STORE(dest, in[13]);
   4089       RECON_AND_STORE(dest, in[14]);
   4090       RECON_AND_STORE(dest, in[15]);
   4091       RECON_AND_STORE(dest, in[16]);
   4092       RECON_AND_STORE(dest, in[17]);
   4093       RECON_AND_STORE(dest, in[18]);
   4094       RECON_AND_STORE(dest, in[19]);
   4095       RECON_AND_STORE(dest, in[20]);
   4096       RECON_AND_STORE(dest, in[21]);
   4097       RECON_AND_STORE(dest, in[22]);
   4098       RECON_AND_STORE(dest, in[23]);
   4099       RECON_AND_STORE(dest, in[24]);
   4100       RECON_AND_STORE(dest, in[25]);
   4101       RECON_AND_STORE(dest, in[26]);
   4102       RECON_AND_STORE(dest, in[27]);
   4103       RECON_AND_STORE(dest, in[28]);
   4104       RECON_AND_STORE(dest, in[29]);
   4105       RECON_AND_STORE(dest, in[30]);
   4106       RECON_AND_STORE(dest, in[31]);
   4107 
   4108       dest += 8 - (stride * 32);
   4109     }
   4110 }  //NOLINT
   4111 
   4112 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   4113   __m128i dc_value;
   4114   const __m128i zero = _mm_setzero_si128();
   4115   int a, i;
   4116 
   4117   a = dct_const_round_shift(input[0] * cospi_16_64);
   4118   a = dct_const_round_shift(a * cospi_16_64);
   4119   a = ROUND_POWER_OF_TWO(a, 6);
   4120 
   4121   dc_value = _mm_set1_epi16(a);
   4122 
   4123   for (i = 0; i < 4; ++i) {
   4124     RECON_AND_STORE(dest, dc_value);
   4125     RECON_AND_STORE(dest, dc_value);
   4126     RECON_AND_STORE(dest, dc_value);
   4127     RECON_AND_STORE(dest, dc_value);
   4128     RECON_AND_STORE(dest, dc_value);
   4129     RECON_AND_STORE(dest, dc_value);
   4130     RECON_AND_STORE(dest, dc_value);
   4131     RECON_AND_STORE(dest, dc_value);
   4132     RECON_AND_STORE(dest, dc_value);
   4133     RECON_AND_STORE(dest, dc_value);
   4134     RECON_AND_STORE(dest, dc_value);
   4135     RECON_AND_STORE(dest, dc_value);
   4136     RECON_AND_STORE(dest, dc_value);
   4137     RECON_AND_STORE(dest, dc_value);
   4138     RECON_AND_STORE(dest, dc_value);
   4139     RECON_AND_STORE(dest, dc_value);
   4140     RECON_AND_STORE(dest, dc_value);
   4141     RECON_AND_STORE(dest, dc_value);
   4142     RECON_AND_STORE(dest, dc_value);
   4143     RECON_AND_STORE(dest, dc_value);
   4144     RECON_AND_STORE(dest, dc_value);
   4145     RECON_AND_STORE(dest, dc_value);
   4146     RECON_AND_STORE(dest, dc_value);
   4147     RECON_AND_STORE(dest, dc_value);
   4148     RECON_AND_STORE(dest, dc_value);
   4149     RECON_AND_STORE(dest, dc_value);
   4150     RECON_AND_STORE(dest, dc_value);
   4151     RECON_AND_STORE(dest, dc_value);
   4152     RECON_AND_STORE(dest, dc_value);
   4153     RECON_AND_STORE(dest, dc_value);
   4154     RECON_AND_STORE(dest, dc_value);
   4155     RECON_AND_STORE(dest, dc_value);
   4156     dest += 8 - (stride * 32);
   4157   }
   4158 }
   4159