/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>     // for assert() in the tx_type switches below
#include <emmintrin.h>  // SSE2
#include "vp9/common/vp9_idct.h"  // for cospi constants
#include "vpx_ports/mem.h"

void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. Because the
  // first pass left its results transposed, transforming its "columns"
  // actually transforms the original rows, and a final transpose puts
  // everything back in normal/row positions.
  int pass;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32-bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in0, in1, in2, in3;
  // Load inputs.
  {
    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
    in2  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
    in3  = _mm_loadl_epi64((const __m128i *)(input +  3 * stride));
    // x = x << 4
    in0 = _mm_slli_epi16(in0, 4);
    in1 = _mm_slli_epi16(in1, 4);
    in2 = _mm_slli_epi16(in2, 4);
    in3 = _mm_slli_epi16(in3, 4);
    // if (i == 0 && input[0]) input[0] += 1;
    {
      // The mask will only contain whether the first value is zero; all
      // other comparisons will fail as something shifted by 4 (above << 4)
      // can never be equal to one. To increment in the non-zero case, we
      // add the mask and one for the first element:
      //   - if zero, mask = -1, v = v - 1 + 1 = v
      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
      in0 = _mm_add_epi16(in0, mask);
      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
    }
  }
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // Transform 1/2: Add/subtract
    const __m128i r0 = _mm_add_epi16(in0, in3);
    const __m128i r1 = _mm_add_epi16(in1, in2);
    const __m128i r2 = _mm_sub_epi16(in1, in2);
    const __m128i r3 = _mm_sub_epi16(in0, in3);
    // Transform 1/2: Interleave to do the multiply by constants which gets us
    //                into 32 bits.
    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
    // Combine and transpose
    const __m128i res0 = _mm_packs_epi32(w0, w2);
    const __m128i res1 = _mm_packs_epi32(w4, w6);
    // 00 01 02 03 20 21 22 23
    // 10 11 12 13 30 31 32 33
    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
    // 00 10 01 11 02 12 03 13
    // 20 30 21 31 22 32 23 33
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
    // 00 10 20 30 01 11 21 31      in0 contains 0 followed by 1
    // 02 12 22 32 03 13 23 33      in2 contains 2 followed by 3
    if (0 == pass) {
      // Extract values in the high part for second pass as transform code
      // only uses the first four values.
      in1 = _mm_unpackhi_epi64(in0, in0);
      in3 = _mm_unpackhi_epi64(in2, in2);
    } else {
      // Post-condition the output, (v + 1) >> 2, and store it, taking
      // advantage of the fact that rows 1/3 are stored just after rows 0/2.
      __m128i out01 = _mm_add_epi16(in0, kOne);
      __m128i out23 = _mm_add_epi16(in2, kOne);
      out01 = _mm_srai_epi16(out01, 2);
      out23 = _mm_srai_epi16(out23, 2);
      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
    }
  }
}
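
// A minimal scalar model of one fdct4 column pass, kept here as a sketch for
// cross-checking the vector butterfly above. It assumes the cospi_*_64
// constants and dct_const_round_shift() from vp9/common/vp9_idct.h; the
// pre-scaling (<< 4) and post-rounding ((v + 1) >> 2) are applied by the
// caller, as in vp9_fdct4x4_sse2.
static INLINE void fdct4_1d_scalar_model(const int16_t in[4], int16_t out[4]) {
  const int s0 = in[0] + in[3];
  const int s1 = in[1] + in[2];
  const int s2 = in[1] - in[2];
  const int s3 = in[0] - in[3];
  out[0] = (int16_t)dct_const_round_shift((s0 + s1) * cospi_16_64);
  out[2] = (int16_t)dct_const_round_shift((s0 - s1) * cospi_16_64);
  out[1] = (int16_t)dct_const_round_shift(s2 * cospi_24_64 + s3 * cospi_8_64);
  out[3] = (int16_t)dct_const_round_shift(s3 * cospi_24_64 - s2 * cospi_8_64);
}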

static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride) {
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i mask;

  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  // Pre-condition input (shift by four).
  in[0] = _mm_slli_epi16(in[0], 4);
  in[1] = _mm_slli_epi16(in[1], 4);
  in[2] = _mm_slli_epi16(in[2], 4);
  in[3] = _mm_slli_epi16(in[3], 4);

  // Same nonzero-bias trick as in vp9_fdct4x4_sse2: bump the first element
  // by one iff it is non-zero.
  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
  in[0] = _mm_add_epi16(in[0], mask);
  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}

static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
  __m128i out01 = _mm_add_epi16(in01, kOne);
  __m128i out23 = _mm_add_epi16(in23, kOne);
  out01 = _mm_srai_epi16(out01, 2);
  out23 = _mm_srai_epi16(out23, 2);
  _mm_store_si128((__m128i *)(output + 0 * 8), out01);
  _mm_store_si128((__m128i *)(output + 1 * 8), out23);
}

static INLINE void transpose_4x4(__m128i *res) {
  // Combine and transpose
  // 00 01 02 03 20 21 22 23
  // 10 11 12 13 30 31 32 33
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  // 00 10 20 30 01 11 21 31
  // 02 12 22 32 03 13 23 33
  // only use the first 4 16-bit integers
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

void fdct4_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u[4], v[4];
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[3], in[2]);

  v[0] = _mm_add_epi16(u[0], u[1]);
  v[1] = _mm_sub_epi16(u[0], u[1]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
  transpose_4x4(in);
}

void fadst4_1d_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
  __m128i in7 = _mm_add_epi16(in[0], in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);
  u[4] = _mm_unpacklo_epi16(in[3], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // (x0 + x1) * sinpi_3_9
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);  // x3 * sinpi_3_9

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_sub_epi32(v[2], v[6]);               // (x0 + x1 - x3) * sinpi_3_9
  u[2] = _mm_add_epi32(v[3], v[4]);
  u[3] = _mm_sub_epi32(u[2], u[0]);
  u[4] = _mm_slli_epi32(v[5], 2);                 // 4 * s4
  u[5] = _mm_sub_epi32(u[4], v[5]);               // 3 * s4
  u[6] = _mm_add_epi32(u[3], u[5]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}
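
// A scalar sketch of the ADST4 above, for reference; it assumes the
// sinpi_*_9 constants and dct_const_round_shift() from
// vp9/common/vp9_idct.h and mirrors the vector dataflow, including the
// 3 * s4 term formed above via (s4 << 2) - s4.
static INLINE void fadst4_1d_scalar_model(const int16_t in[4],
                                          int16_t out[4]) {
  const int x0 = in[0], x1 = in[1], x2 = in[2], x3 = in[3];
  const int a = sinpi_1_9 * x0 + sinpi_2_9 * x1;   // s0 + s2
  const int b = sinpi_3_9 * x2 + sinpi_4_9 * x3;   // s4 + s5
  const int c = sinpi_4_9 * x0 - sinpi_1_9 * x1;   // s1 - s3
  const int d = -sinpi_3_9 * x2 + sinpi_2_9 * x3;  // -s4 + s6
  const int t0 = a + b;
  const int t2 = c + d;
  out[0] = (int16_t)dct_const_round_shift(t0);
  out[1] = (int16_t)dct_const_round_shift(sinpi_3_9 * (x0 + x1 - x3));
  out[2] = (int16_t)dct_const_round_shift(t2);
  out[3] = (int16_t)dct_const_round_shift(t2 - t0 + 3 * sinpi_3_9 * x2);
}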

void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
                           int stride, int tx_type) {
  __m128i in[4];
  load_buffer_4x4(input, in, stride);
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct4_1d_sse2(in);
      fdct4_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      fadst4_1d_sse2(in);
      fdct4_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      fdct4_1d_sse2(in);
      fadst4_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      fadst4_1d_sse2(in);
      fadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }
  write_buffer_4x4(output, in);
}
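
// Usage sketch (hypothetical caller, not part of this file): the input rows
// are read with 64-bit loads, so only the 16-coefficient output buffer needs
// 16-byte alignment (write_buffer_4x4 uses aligned 128-bit stores).
//
//   DECLARE_ALIGNED(16, int16_t, coeff[4 * 4]);
//   vp9_short_fht4x4_sse2(residual, coeff, stride, 3);  // 3 == ADST_ADST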

void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
  int pass;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32-bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Load input
  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);

  // We do two passes, first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
    }
    // Work on next four results
    {
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
    }
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    //    division by two of 16-bit signed numbers using shifts
    //    n / 2 = (n - (n >> 15)) >> 1
    //    e.g. for n = -3: sign = -3 >> 15 = -1, (-3 - (-1)) >> 1 = -1,
    //    which rounds toward zero, unlike the plain shift -3 >> 1 = -2.
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
    // store results
    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
  }
}

// load 8x8 array
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride) {
  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));

  // Pre-condition input (shift by two)
  in[0] = _mm_slli_epi16(in[0], 2);
  in[1] = _mm_slli_epi16(in[1], 2);
  in[2] = _mm_slli_epi16(in[2], 2);
  in[3] = _mm_slli_epi16(in[3], 2);
  in[4] = _mm_slli_epi16(in[4], 2);
  in[5] = _mm_slli_epi16(in[5], 2);
  in[6] = _mm_slli_epi16(in[6], 2);
  in[7] = _mm_slli_epi16(in[7], 2);
}

// right shift and rounding
static INLINE void right_shift_8x8(__m128i *res, int const bit) {
  const __m128i kOne = _mm_set1_epi16(1);
  const int bit_m02 = bit - 2;
  __m128i sign0 = _mm_srai_epi16(res[0], 15);
  __m128i sign1 = _mm_srai_epi16(res[1], 15);
  __m128i sign2 = _mm_srai_epi16(res[2], 15);
  __m128i sign3 = _mm_srai_epi16(res[3], 15);
  __m128i sign4 = _mm_srai_epi16(res[4], 15);
  __m128i sign5 = _mm_srai_epi16(res[5], 15);
  __m128i sign6 = _mm_srai_epi16(res[6], 15);
  __m128i sign7 = _mm_srai_epi16(res[7], 15);

  if (bit_m02 >= 0) {
    // Rounding bias of 2^(bit - 2); skipped for bit == 1, where the sign
    // correction below already gives round-toward-zero division.
    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
    res[0] = _mm_add_epi16(res[0], k_const_rounding);
    res[1] = _mm_add_epi16(res[1], k_const_rounding);
    res[2] = _mm_add_epi16(res[2], k_const_rounding);
    res[3] = _mm_add_epi16(res[3], k_const_rounding);
    res[4] = _mm_add_epi16(res[4], k_const_rounding);
    res[5] = _mm_add_epi16(res[5], k_const_rounding);
    res[6] = _mm_add_epi16(res[6], k_const_rounding);
    res[7] = _mm_add_epi16(res[7], k_const_rounding);
  }

  res[0] = _mm_sub_epi16(res[0], sign0);
  res[1] = _mm_sub_epi16(res[1], sign1);
  res[2] = _mm_sub_epi16(res[2], sign2);
  res[3] = _mm_sub_epi16(res[3], sign3);
  res[4] = _mm_sub_epi16(res[4], sign4);
  res[5] = _mm_sub_epi16(res[5], sign5);
  res[6] = _mm_sub_epi16(res[6], sign6);
  res[7] = _mm_sub_epi16(res[7], sign7);

  res[0] = _mm_srai_epi16(res[0], bit);
  res[1] = _mm_srai_epi16(res[1], bit);
  res[2] = _mm_srai_epi16(res[2], bit);
  res[3] = _mm_srai_epi16(res[3], bit);
  res[4] = _mm_srai_epi16(res[4], bit);
  res[5] = _mm_srai_epi16(res[5], bit);
  res[6] = _mm_srai_epi16(res[6], bit);
  res[7] = _mm_srai_epi16(res[7], bit);
}
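
// Scalar sketch of the rounding performed by right_shift_8x8, for reference
// only: bias by 2^(bit - 2) when bit >= 2, add one more for negative inputs
// (the sign subtraction above), then shift right.
static INLINE int16_t right_shift_model(int16_t v, int bit) {
  int x = v;
  if (bit >= 2) x += 1 << (bit - 2);
  if (v < 0) x += 1;  // same as subtracting sign = v >> 15
  return (int16_t)(x >> bit);
}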

// write 8x8 array
static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
}

// 8x8 transpose; all reads complete before any write, so in and res may
// alias and callers can transpose in place.
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  // 04 14 05 15 06 16 07 17
  // 24 34 25 35 26 36 27 37
  // 40 50 41 51 42 52 43 53
  // 60 70 61 71 62 72 63 73
  // 44 54 45 55 46 56 47 57
  // 64 74 65 75 66 76 67 77
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
  // 00 10 20 30 01 11 21 31
  // 40 50 60 70 41 51 61 71
  // 02 12 22 32 03 13 23 33
  // 42 52 62 72 43 53 63 73
  // 04 14 24 34 05 15 25 35
  // 44 54 64 74 45 55 65 75
  // 06 16 26 36 07 17 27 37
  // 46 56 66 76 47 57 67 77
  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
  // 00 10 20 30 40 50 60 70
  // 01 11 21 31 41 51 61 71
  // 02 12 22 32 42 52 62 72
  // 03 13 23 33 43 53 63 73
  // 04 14 24 34 44 54 64 74
  // 05 15 25 35 45 55 65 75
  // 06 16 26 36 46 56 66 76
  // 07 17 27 37 47 57 67 77
}

void fdct8_1d_sse2(__m128i *in) {
  // constants
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);

  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);

  // shift and rounding
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);

  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);

  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose
  array_transpose_8x8(in, in);
}
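
// A scalar model of one fdct8 column pass (without the transpose), kept as a
// sketch for cross-checking the stages above; it assumes the cospi_*_64
// constants and dct_const_round_shift() from vp9/common/vp9_idct.h.
static INLINE void fdct8_1d_scalar_model(const int16_t in[8], int16_t out[8]) {
  // stage 1
  const int s0 = in[0] + in[7];
  const int s1 = in[1] + in[6];
  const int s2 = in[2] + in[5];
  const int s3 = in[3] + in[4];
  const int s4 = in[3] - in[4];
  const int s5 = in[2] - in[5];
  const int s6 = in[1] - in[6];
  const int s7 = in[0] - in[7];
  // even half: a 4-point DCT on s0..s3
  const int x0 = s0 + s3;
  const int x1 = s1 + s2;
  const int x2 = s1 - s2;
  const int x3 = s0 - s3;
  // stage 2: rotate the middle pair of the odd half
  const int t2 = dct_const_round_shift((s6 - s5) * cospi_16_64);
  const int t3 = dct_const_round_shift((s6 + s5) * cospi_16_64);
  // stage 3
  const int y0 = s4 + t2;
  const int y1 = s4 - t2;
  const int y2 = s7 - t3;
  const int y3 = s7 + t3;
  out[0] = (int16_t)dct_const_round_shift((x0 + x1) * cospi_16_64);
  out[4] = (int16_t)dct_const_round_shift((x0 - x1) * cospi_16_64);
  out[2] = (int16_t)dct_const_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64);
  out[6] = (int16_t)dct_const_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
  // stage 4
  out[1] = (int16_t)dct_const_round_shift(y0 * cospi_28_64 + y3 * cospi_4_64);
  out[3] = (int16_t)dct_const_round_shift(y2 * cospi_12_64 - y1 * cospi_20_64);
  out[5] = (int16_t)dct_const_round_shift(y1 * cospi_12_64 + y2 * cospi_20_64);
  out[7] = (int16_t)dct_const_round_shift(y3 * cospi_28_64 - y0 * cospi_4_64);
}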

void fadst8_1d_sse2(__m128i *in) {
  // Constants
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // reorder input rows into the ADST butterfly pairing
  in0  = in[7];
  in1  = in[0];
  in2  = in[5];
  in3  = in[2];
  in4  = in[3];
  in5  = in[4];
  in6  = in[1];
  in7  = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition/subtraction
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // FIXME(jingning): do subtract using bit inversion?
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);

  // transpose
  array_transpose_8x8(in, in);
}

void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
                           int stride, int tx_type) {
  __m128i in[8];
  load_buffer_8x8(input, in, stride);
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct8_1d_sse2(in);
      fdct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      fadst8_1d_sse2(in);
      fdct8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      fdct8_1d_sse2(in);
      fadst8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      fadst8_1d_sse2(in);
      fadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }
  right_shift_8x8(in, 1);
  write_buffer_8x8(output, in, 8);
}
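
// Usage sketch (hypothetical caller, not part of this file): load_buffer_8x8
// uses aligned 128-bit loads, so each input row must be 16-byte aligned
// (stride a multiple of 8); the 64 output coefficients are written
// contiguously with stride 8.
//
//   DECLARE_ALIGNED(16, int16_t, coeff[8 * 8]);
//   vp9_short_fht8x8_sse2(residual, coeff, stride, 1);  // 1 == ADST_DCT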

void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. Because the
  // first pass left its results transposed, transforming its "columns"
  // actually transforms the original rows, and a final transpose puts
  // everything back in normal/row positions.
  int pass;
  // We need an intermediate buffer between passes.
  DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
  const int16_t *in = input;
  int16_t *out = intermediate;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32-bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kOne = _mm_set1_epi16(1);
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
    int column_start;
    for (column_start = 0; column_start < 16; column_start += 8) {
      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
      __m128i step1_0, step1_1, step1_2, step1_3;
      __m128i step1_4, step1_5, step1_6, step1_7;
      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
      __m128i step3_0, step3_1, step3_2, step3_3;
      __m128i step3_4, step3_5, step3_6, step3_7;
      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
      // Load and pre-condition input.
      if (0 == pass) {
        in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
        in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
        in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
        in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
        in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
        in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
        in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
        in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
        in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
        in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
        in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
        in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
        in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
        in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
        in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
        in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
        // x = x << 2
        in00 = _mm_slli_epi16(in00, 2);
        in01 = _mm_slli_epi16(in01, 2);
        in02 = _mm_slli_epi16(in02, 2);
        in03 = _mm_slli_epi16(in03, 2);
        in04 = _mm_slli_epi16(in04, 2);
        in05 = _mm_slli_epi16(in05, 2);
        in06 = _mm_slli_epi16(in06, 2);
        in07 = _mm_slli_epi16(in07, 2);
        in08 = _mm_slli_epi16(in08, 2);
        in09 = _mm_slli_epi16(in09, 2);
        in10 = _mm_slli_epi16(in10, 2);
        in11 = _mm_slli_epi16(in11, 2);
        in12 = _mm_slli_epi16(in12, 2);
        in13 = _mm_slli_epi16(in13, 2);
        in14 = _mm_slli_epi16(in14, 2);
        in15 = _mm_slli_epi16(in15, 2);
      } else {
        in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
        in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
        in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
        in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
        in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
        in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
        in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
        in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
        in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
        in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
        in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
        in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
        in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
        in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
        in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
        in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
        // x = (x + 1) >> 2
        in00 = _mm_add_epi16(in00, kOne);
        in01 = _mm_add_epi16(in01, kOne);
        in02 = _mm_add_epi16(in02, kOne);
        in03 = _mm_add_epi16(in03, kOne);
        in04 = _mm_add_epi16(in04, kOne);
        in05 = _mm_add_epi16(in05, kOne);
        in06 = _mm_add_epi16(in06, kOne);
        in07 = _mm_add_epi16(in07, kOne);
        in08 = _mm_add_epi16(in08, kOne);
        in09 = _mm_add_epi16(in09, kOne);
        in10 = _mm_add_epi16(in10, kOne);
        in11 = _mm_add_epi16(in11, kOne);
        in12 = _mm_add_epi16(in12, kOne);
        in13 = _mm_add_epi16(in13, kOne);
        in14 = _mm_add_epi16(in14, kOne);
        in15 = _mm_add_epi16(in15, kOne);
        in00 = _mm_srai_epi16(in00, 2);
        in01 = _mm_srai_epi16(in01, 2);
        in02 = _mm_srai_epi16(in02, 2);
        in03 = _mm_srai_epi16(in03, 2);
        in04 = _mm_srai_epi16(in04, 2);
        in05 = _mm_srai_epi16(in05, 2);
        in06 = _mm_srai_epi16(in06, 2);
        in07 = _mm_srai_epi16(in07, 2);
        in08 = _mm_srai_epi16(in08, 2);
        in09 = _mm_srai_epi16(in09, 2);
        in10 = _mm_srai_epi16(in10, 2);
        in11 = _mm_srai_epi16(in11, 2);
        in12 = _mm_srai_epi16(in12, 2);
        in13 = _mm_srai_epi16(in13, 2);
   1193         in14 = _mm_srai_epi16(in14, 2);
   1194         in15 = _mm_srai_epi16(in15, 2);
   1195       }
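               // Net effect of the two branches: pass 0 scales up by 4
               // (x << 2) so the column transform keeps precision, and pass 1
               // removes that scale with a biased divide, per coefficient:
               //   x = (x + 1) >> 2;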
   1196       in += 8;
   1197       // Calculate input for the first 8 results.
   1198       {
   1199         input0 = _mm_add_epi16(in00, in15);
   1200         input1 = _mm_add_epi16(in01, in14);
   1201         input2 = _mm_add_epi16(in02, in13);
   1202         input3 = _mm_add_epi16(in03, in12);
   1203         input4 = _mm_add_epi16(in04, in11);
   1204         input5 = _mm_add_epi16(in05, in10);
   1205         input6 = _mm_add_epi16(in06, in09);
   1206         input7 = _mm_add_epi16(in07, in08);
   1207       }
   1208       // Calculate input for the next 8 results.
   1209       {
   1210         step1_0 = _mm_sub_epi16(in07, in08);
   1211         step1_1 = _mm_sub_epi16(in06, in09);
   1212         step1_2 = _mm_sub_epi16(in05, in10);
   1213         step1_3 = _mm_sub_epi16(in04, in11);
   1214         step1_4 = _mm_sub_epi16(in03, in12);
   1215         step1_5 = _mm_sub_epi16(in02, in13);
   1216         step1_6 = _mm_sub_epi16(in01, in14);
   1217         step1_7 = _mm_sub_epi16(in00, in15);
   1218       }
   1219       // Work on the first eight values; fdct8_1d(input, even_results);
   1220       {
    1221         // Add/subtract
   1222         const __m128i q0 = _mm_add_epi16(input0, input7);
   1223         const __m128i q1 = _mm_add_epi16(input1, input6);
   1224         const __m128i q2 = _mm_add_epi16(input2, input5);
   1225         const __m128i q3 = _mm_add_epi16(input3, input4);
   1226         const __m128i q4 = _mm_sub_epi16(input3, input4);
   1227         const __m128i q5 = _mm_sub_epi16(input2, input5);
   1228         const __m128i q6 = _mm_sub_epi16(input1, input6);
   1229         const __m128i q7 = _mm_sub_epi16(input0, input7);
   1230         // Work on first four results
   1231         {
    1232           // Add/subtract
   1233           const __m128i r0 = _mm_add_epi16(q0, q3);
   1234           const __m128i r1 = _mm_add_epi16(q1, q2);
   1235           const __m128i r2 = _mm_sub_epi16(q1, q2);
   1236           const __m128i r3 = _mm_sub_epi16(q0, q3);
   1237           // Interleave to do the multiply by constants which gets us
   1238           // into 32 bits.
   1239           const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
   1240           const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
   1241           const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
   1242           const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
   1243           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
   1244           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
   1245           const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
   1246           const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
   1247           const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
   1248           const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
   1249           const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
   1250           const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
   1251           // dct_const_round_shift
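                   // Equivalent scalar form of the add + shift pairs below,
                   // given DCT_CONST_ROUNDING == 1 << (DCT_CONST_BITS - 1)
                   // from vp9_idct.h:
                   //   rounded = (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;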
   1252           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1253           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1254           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1255           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1256           const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
   1257           const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
   1258           const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
   1259           const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
   1260           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1261           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1262           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1263           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1264           const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
   1265           const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
   1266           const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
   1267           const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
   1268           // Combine
   1269           res00 = _mm_packs_epi32(w0, w1);
   1270           res08 = _mm_packs_epi32(w2, w3);
   1271           res04 = _mm_packs_epi32(w4, w5);
   1272           res12 = _mm_packs_epi32(w6, w7);
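                   // _mm_packs_epi32 saturates back down to 16 bits; these
                   // four registers are DCT outputs 0, 8, 4 and 12 of the
                   // current batch.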
   1273         }
   1274         // Work on next four results
   1275         {
   1276           // Interleave to do the multiply by constants which gets us
   1277           // into 32 bits.
   1278           const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
   1279           const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
   1280           const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
   1281           const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
   1282           const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
   1283           const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
   1284           // dct_const_round_shift
   1285           const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
   1286           const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
   1287           const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
   1288           const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
   1289           const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
   1290           const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
   1291           const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
   1292           const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
   1293           // Combine
   1294           const __m128i r0 = _mm_packs_epi32(s0, s1);
   1295           const __m128i r1 = _mm_packs_epi32(s2, s3);
    1296           // Add/subtract
   1297           const __m128i x0 = _mm_add_epi16(q4, r0);
   1298           const __m128i x1 = _mm_sub_epi16(q4, r0);
   1299           const __m128i x2 = _mm_sub_epi16(q7, r1);
   1300           const __m128i x3 = _mm_add_epi16(q7, r1);
   1301           // Interleave to do the multiply by constants which gets us
   1302           // into 32 bits.
   1303           const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
   1304           const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
   1305           const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
   1306           const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
   1307           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
   1308           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
   1309           const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
   1310           const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
   1311           const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
   1312           const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
   1313           const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
   1314           const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
   1315           // dct_const_round_shift
   1316           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1317           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1318           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1319           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1320           const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
   1321           const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
   1322           const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
   1323           const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
   1324           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1325           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1326           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1327           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1328           const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
   1329           const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
   1330           const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
   1331           const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
   1332           // Combine
   1333           res02 = _mm_packs_epi32(w0, w1);
   1334           res14 = _mm_packs_epi32(w2, w3);
   1335           res10 = _mm_packs_epi32(w4, w5);
   1336           res06 = _mm_packs_epi32(w6, w7);
   1337         }
   1338       }
   1339       // Work on the next eight values; step1 -> odd_results
   1340       {
   1341         // step 2
   1342         {
   1343           const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
   1344           const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
   1345           const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
   1346           const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
   1347           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
   1348           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
   1349           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
   1350           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
   1351           // dct_const_round_shift
   1352           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1353           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1354           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1355           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1356           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1357           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1358           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1359           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1360           // Combine
   1361           step2_2 = _mm_packs_epi32(w0, w1);
   1362           step2_3 = _mm_packs_epi32(w2, w3);
   1363         }
   1364         {
   1365           const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
   1366           const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
   1367           const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
   1368           const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
   1369           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
   1370           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
   1371           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
   1372           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
   1373           // dct_const_round_shift
   1374           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1375           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1376           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1377           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1378           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1379           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1380           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1381           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1382           // Combine
   1383           step2_5 = _mm_packs_epi32(w0, w1);
   1384           step2_4 = _mm_packs_epi32(w2, w3);
   1385         }
   1386         // step 3
   1387         {
   1388           step3_0 = _mm_add_epi16(step1_0, step2_3);
   1389           step3_1 = _mm_add_epi16(step1_1, step2_2);
   1390           step3_2 = _mm_sub_epi16(step1_1, step2_2);
   1391           step3_3 = _mm_sub_epi16(step1_0, step2_3);
   1392           step3_4 = _mm_sub_epi16(step1_7, step2_4);
   1393           step3_5 = _mm_sub_epi16(step1_6, step2_5);
   1394           step3_6 = _mm_add_epi16(step1_6, step2_5);
   1395           step3_7 = _mm_add_epi16(step1_7, step2_4);
   1396         }
   1397         // step 4
   1398         {
   1399           const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
   1400           const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
   1401           const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
   1402           const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
   1403           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
   1404           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
   1405           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
   1406           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
   1407           // dct_const_round_shift
   1408           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1409           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1410           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1411           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1412           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1413           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1414           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1415           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1416           // Combine
   1417           step2_1 = _mm_packs_epi32(w0, w1);
   1418           step2_2 = _mm_packs_epi32(w2, w3);
   1419         }
   1420         {
   1421           const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
   1422           const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
   1423           const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
   1424           const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
   1425           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
   1426           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
   1427           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
   1428           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
   1429           // dct_const_round_shift
   1430           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1431           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1432           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1433           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1434           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1435           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1436           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1437           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1438           // Combine
   1439           step2_6 = _mm_packs_epi32(w0, w1);
   1440           step2_5 = _mm_packs_epi32(w2, w3);
   1441         }
   1442         // step 5
   1443         {
   1444           step1_0 = _mm_add_epi16(step3_0, step2_1);
   1445           step1_1 = _mm_sub_epi16(step3_0, step2_1);
   1446           step1_2 = _mm_sub_epi16(step3_3, step2_2);
   1447           step1_3 = _mm_add_epi16(step3_3, step2_2);
   1448           step1_4 = _mm_add_epi16(step3_4, step2_5);
   1449           step1_5 = _mm_sub_epi16(step3_4, step2_5);
   1450           step1_6 = _mm_sub_epi16(step3_7, step2_6);
   1451           step1_7 = _mm_add_epi16(step3_7, step2_6);
   1452         }
   1453         // step 6
   1454         {
   1455           const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
   1456           const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
   1457           const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
   1458           const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
   1459           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
   1460           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
   1461           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
   1462           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
   1463           // dct_const_round_shift
   1464           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1465           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1466           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1467           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1468           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1469           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1470           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1471           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1472           // Combine
   1473           res01 = _mm_packs_epi32(w0, w1);
   1474           res09 = _mm_packs_epi32(w2, w3);
   1475         }
   1476         {
   1477           const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
   1478           const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
   1479           const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
   1480           const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
   1481           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
   1482           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
   1483           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
   1484           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
   1485           // dct_const_round_shift
   1486           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1487           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1488           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1489           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1490           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1491           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1492           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1493           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1494           // Combine
   1495           res05 = _mm_packs_epi32(w0, w1);
   1496           res13 = _mm_packs_epi32(w2, w3);
   1497         }
   1498         {
   1499           const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
   1500           const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
   1501           const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
   1502           const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
   1503           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
   1504           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
   1505           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
   1506           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
   1507           // dct_const_round_shift
   1508           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1509           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1510           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1511           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1512           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1513           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1514           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1515           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1516           // Combine
   1517           res11 = _mm_packs_epi32(w0, w1);
   1518           res03 = _mm_packs_epi32(w2, w3);
   1519         }
   1520         {
   1521           const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
   1522           const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
   1523           const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
   1524           const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
   1525           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
   1526           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
   1527           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
   1528           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
   1529           // dct_const_round_shift
   1530           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1531           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1532           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1533           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1534           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1535           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1536           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1537           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1538           // Combine
   1539           res15 = _mm_packs_epi32(w0, w1);
   1540           res07 = _mm_packs_epi32(w2, w3);
   1541         }
   1542       }
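               // res00..res15 now hold the 16 DCT outputs for these eight
               // lanes: even indices come from the 8-point DCT on
               // in_i + in_(15-i), odd indices from the step network on
               // in_i - in_(15-i).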
    1543       // Transpose the results; do it as two 8x8 transposes.
   1544       {
   1545         // 00 01 02 03 04 05 06 07
   1546         // 10 11 12 13 14 15 16 17
   1547         // 20 21 22 23 24 25 26 27
   1548         // 30 31 32 33 34 35 36 37
   1549         // 40 41 42 43 44 45 46 47
   1550         // 50 51 52 53 54 55 56 57
   1551         // 60 61 62 63 64 65 66 67
   1552         // 70 71 72 73 74 75 76 77
   1553         const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
   1554         const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
   1555         const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
   1556         const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
   1557         const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
   1558         const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
   1559         const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
   1560         const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
   1561         // 00 10 01 11 02 12 03 13
   1562         // 20 30 21 31 22 32 23 33
   1563         // 04 14 05 15 06 16 07 17
   1564         // 24 34 25 35 26 36 27 37
   1565         // 40 50 41 51 42 52 43 53
   1566         // 60 70 61 71 62 72 63 73
    1567         // 44 54 45 55 46 56 47 57
   1568         // 64 74 65 75 66 76 67 77
   1569         const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
   1570         const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
   1571         const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
   1572         const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
   1573         const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
   1574         const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
   1575         const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
   1576         const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
   1577         // 00 10 20 30 01 11 21 31
   1578         // 40 50 60 70 41 51 61 71
   1579         // 02 12 22 32 03 13 23 33
   1580         // 42 52 62 72 43 53 63 73
    1581         // 04 14 24 34 05 15 25 35
    1582         // 44 54 64 74 45 55 65 75
   1583         // 06 16 26 36 07 17 27 37
   1584         // 46 56 66 76 47 57 67 77
   1585         const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
   1586         const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
   1587         const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
   1588         const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
   1589         const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
   1590         const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
   1591         const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
   1592         const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
   1593         // 00 10 20 30 40 50 60 70
   1594         // 01 11 21 31 41 51 61 71
   1595         // 02 12 22 32 42 52 62 72
   1596         // 03 13 23 33 43 53 63 73
   1597         // 04 14 24 34 44 54 64 74
   1598         // 05 15 25 35 45 55 65 75
   1599         // 06 16 26 36 46 56 66 76
   1600         // 07 17 27 37 47 57 67 77
   1601         _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
   1602         _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
   1603         _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
   1604         _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
   1605         _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
   1606         _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
   1607         _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
   1608         _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
   1609       }
   1610       {
   1611         // 00 01 02 03 04 05 06 07
   1612         // 10 11 12 13 14 15 16 17
   1613         // 20 21 22 23 24 25 26 27
   1614         // 30 31 32 33 34 35 36 37
   1615         // 40 41 42 43 44 45 46 47
   1616         // 50 51 52 53 54 55 56 57
   1617         // 60 61 62 63 64 65 66 67
   1618         // 70 71 72 73 74 75 76 77
   1619         const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
   1620         const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
   1621         const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
   1622         const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
   1623         const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
   1624         const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
   1625         const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
   1626         const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
   1627         // 00 10 01 11 02 12 03 13
   1628         // 20 30 21 31 22 32 23 33
   1629         // 04 14 05 15 06 16 07 17
   1630         // 24 34 25 35 26 36 27 37
   1631         // 40 50 41 51 42 52 43 53
   1632         // 60 70 61 71 62 72 63 73
    1633         // 44 54 45 55 46 56 47 57
   1634         // 64 74 65 75 66 76 67 77
   1635         const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
   1636         const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
   1637         const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
   1638         const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
   1639         const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
   1640         const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
   1641         const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
   1642         const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
   1643         // 00 10 20 30 01 11 21 31
   1644         // 40 50 60 70 41 51 61 71
   1645         // 02 12 22 32 03 13 23 33
   1646         // 42 52 62 72 43 53 63 73
    1647         // 04 14 24 34 05 15 25 35
    1648         // 44 54 64 74 45 55 65 75
   1649         // 06 16 26 36 07 17 27 37
   1650         // 46 56 66 76 47 57 67 77
   1651         const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
   1652         const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
   1653         const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
   1654         const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
   1655         const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
   1656         const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
   1657         const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
   1658         const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
   1659         // 00 10 20 30 40 50 60 70
   1660         // 01 11 21 31 41 51 61 71
   1661         // 02 12 22 32 42 52 62 72
   1662         // 03 13 23 33 43 53 63 73
   1663         // 04 14 24 34 44 54 64 74
   1664         // 05 15 25 35 45 55 65 75
   1665         // 06 16 26 36 46 56 66 76
   1666         // 07 17 27 37 47 57 67 77
   1667         // Store results
   1668         _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
   1669         _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
   1670         _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
   1671         _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
   1672         _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
   1673         _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
   1674         _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
   1675         _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
   1676       }
    1677       out += 8 * 16;
   1678     }
    1679     // Set up in/out for the next pass.
   1680     in = intermediate;
   1681     out = output;
   1682   }
   1683 }
   1684 
   1685 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
   1686                                      __m128i *in1, int stride) {
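           // in0[0..15] receive columns 0-7 of all 16 rows and in1[0..15]
           // columns 8-15; each load_buffer_8x8 call fills eight rows of one
           // half.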
   1687   // load first 8 columns
   1688   load_buffer_8x8(input, in0, stride);
   1689   load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
   1690 
   1691   input += 8;
   1692   // load second 8 columns
   1693   load_buffer_8x8(input, in1, stride);
   1694   load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
   1695 }
   1696 
   1697 static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
   1698                                       __m128i *in1, int stride) {
   1699   // write first 8 columns
   1700   write_buffer_8x8(output, in0, stride);
   1701   write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
   1702   // write second 8 columns
   1703   output += 8;
   1704   write_buffer_8x8(output, in1, stride);
   1705   write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
   1706 }
   1707 
   1708 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
   1709   __m128i tbuf[8];
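           // View the 16x16 as four 8x8 blocks [A B; C D], with res0 holding
           // A over C and res1 holding B over D. The transpose is
           // [A' C'; B' D'], so tbuf parks B' while C' overwrites res1.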
   1710   array_transpose_8x8(res0, res0);
   1711   array_transpose_8x8(res1, tbuf);
   1712   array_transpose_8x8(res0 + 8, res1);
   1713   array_transpose_8x8(res1 + 8, res1 + 8);
   1714 
   1715   res0[8] = tbuf[0];
   1716   res0[9] = tbuf[1];
   1717   res0[10] = tbuf[2];
   1718   res0[11] = tbuf[3];
   1719   res0[12] = tbuf[4];
   1720   res0[13] = tbuf[5];
   1721   res0[14] = tbuf[6];
   1722   res0[15] = tbuf[7];
   1723 }
   1724 
   1725 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
    1726   // rounded right shift by 2 over all four groups of eight registers
   1727   right_shift_8x8(res0, 2);
   1728   right_shift_8x8(res0 + 8, 2);
   1729   right_shift_8x8(res1, 2);
   1730   right_shift_8x8(res1 + 8, 2);
   1731 }
   1732 
   1733 void fdct16_1d_8col(__m128i *in) {
    1734   // perform a 16-point 1-D DCT on each of 8 columns
   1735   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
   1736   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   1737   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   1738   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   1739   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   1740   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
   1741   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   1742   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
   1743   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
   1744   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
   1745   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
   1746   const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
   1747   const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
   1748   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
   1749   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
   1750   const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
   1751   const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
   1752   const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
   1753   const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
   1754   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   1755 
   1756   // stage 1
   1757   i[0] = _mm_add_epi16(in[0], in[15]);
   1758   i[1] = _mm_add_epi16(in[1], in[14]);
   1759   i[2] = _mm_add_epi16(in[2], in[13]);
   1760   i[3] = _mm_add_epi16(in[3], in[12]);
   1761   i[4] = _mm_add_epi16(in[4], in[11]);
   1762   i[5] = _mm_add_epi16(in[5], in[10]);
   1763   i[6] = _mm_add_epi16(in[6], in[9]);
   1764   i[7] = _mm_add_epi16(in[7], in[8]);
   1765 
   1766   s[0] = _mm_sub_epi16(in[7], in[8]);
   1767   s[1] = _mm_sub_epi16(in[6], in[9]);
   1768   s[2] = _mm_sub_epi16(in[5], in[10]);
   1769   s[3] = _mm_sub_epi16(in[4], in[11]);
   1770   s[4] = _mm_sub_epi16(in[3], in[12]);
   1771   s[5] = _mm_sub_epi16(in[2], in[13]);
   1772   s[6] = _mm_sub_epi16(in[1], in[14]);
   1773   s[7] = _mm_sub_epi16(in[0], in[15]);
   1774 
   1775   p[0] = _mm_add_epi16(i[0], i[7]);
   1776   p[1] = _mm_add_epi16(i[1], i[6]);
   1777   p[2] = _mm_add_epi16(i[2], i[5]);
   1778   p[3] = _mm_add_epi16(i[3], i[4]);
   1779   p[4] = _mm_sub_epi16(i[3], i[4]);
   1780   p[5] = _mm_sub_epi16(i[2], i[5]);
   1781   p[6] = _mm_sub_epi16(i[1], i[6]);
   1782   p[7] = _mm_sub_epi16(i[0], i[7]);
   1783 
   1784   u[0] = _mm_add_epi16(p[0], p[3]);
   1785   u[1] = _mm_add_epi16(p[1], p[2]);
   1786   u[2] = _mm_sub_epi16(p[1], p[2]);
   1787   u[3] = _mm_sub_epi16(p[0], p[3]);
   1788 
   1789   v[0] = _mm_unpacklo_epi16(u[0], u[1]);
   1790   v[1] = _mm_unpackhi_epi16(u[0], u[1]);
   1791   v[2] = _mm_unpacklo_epi16(u[2], u[3]);
   1792   v[3] = _mm_unpackhi_epi16(u[2], u[3]);
   1793 
   1794   u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
   1795   u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
   1796   u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
   1797   u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
   1798   u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
   1799   u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
   1800   u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
   1801   u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
   1802 
   1803   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
   1804   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
   1805   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
   1806   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
   1807   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
   1808   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
   1809   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
   1810   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
   1811 
   1812   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
   1813   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
   1814   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
   1815   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
   1816   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
   1817   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
   1818   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
   1819   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
   1820 
   1821   in[0] = _mm_packs_epi32(u[0], u[1]);
   1822   in[4] = _mm_packs_epi32(u[4], u[5]);
   1823   in[8] = _mm_packs_epi32(u[2], u[3]);
   1824   in[12] = _mm_packs_epi32(u[6], u[7]);
   1825 
   1826   u[0] = _mm_unpacklo_epi16(p[5], p[6]);
   1827   u[1] = _mm_unpackhi_epi16(p[5], p[6]);
   1828   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
   1829   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
   1830   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
   1831   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
   1832 
   1833   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
   1834   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
   1835   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
   1836   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
   1837 
   1838   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
   1839   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
   1840   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   1841   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
   1842 
   1843   u[0] = _mm_packs_epi32(v[0], v[1]);
   1844   u[1] = _mm_packs_epi32(v[2], v[3]);
   1845 
   1846   t[0] = _mm_add_epi16(p[4], u[0]);
   1847   t[1] = _mm_sub_epi16(p[4], u[0]);
   1848   t[2] = _mm_sub_epi16(p[7], u[1]);
   1849   t[3] = _mm_add_epi16(p[7], u[1]);
   1850 
   1851   u[0] = _mm_unpacklo_epi16(t[0], t[3]);
   1852   u[1] = _mm_unpackhi_epi16(t[0], t[3]);
   1853   u[2] = _mm_unpacklo_epi16(t[1], t[2]);
   1854   u[3] = _mm_unpackhi_epi16(t[1], t[2]);
   1855 
   1856   v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
   1857   v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
   1858   v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
   1859   v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
   1860   v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
   1861   v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
   1862   v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
   1863   v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
   1864 
   1865   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
   1866   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
   1867   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
   1868   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
   1869   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
   1870   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
   1871   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
   1872   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
   1873 
   1874   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
   1875   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
   1876   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   1877   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
   1878   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
   1879   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
   1880   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
   1881   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
   1882 
   1883   in[2] = _mm_packs_epi32(v[0], v[1]);
   1884   in[6] = _mm_packs_epi32(v[4], v[5]);
   1885   in[10] = _mm_packs_epi32(v[2], v[3]);
   1886   in[14] = _mm_packs_epi32(v[6], v[7]);
   1887 
   1888   // stage 2
   1889   u[0] = _mm_unpacklo_epi16(s[2], s[5]);
   1890   u[1] = _mm_unpackhi_epi16(s[2], s[5]);
   1891   u[2] = _mm_unpacklo_epi16(s[3], s[4]);
   1892   u[3] = _mm_unpackhi_epi16(s[3], s[4]);
   1893 
   1894   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
   1895   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
   1896   v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
   1897   v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
   1898   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
   1899   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
   1900   v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
   1901   v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
   1902 
   1903   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
   1904   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
   1905   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
   1906   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
   1907   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
   1908   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
   1909   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
   1910   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
   1911 
   1912   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
   1913   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
   1914   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   1915   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
   1916   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
   1917   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
   1918   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
   1919   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
   1920 
   1921   t[2] = _mm_packs_epi32(v[0], v[1]);
   1922   t[3] = _mm_packs_epi32(v[2], v[3]);
   1923   t[4] = _mm_packs_epi32(v[4], v[5]);
   1924   t[5] = _mm_packs_epi32(v[6], v[7]);
   1925 
   1926   // stage 3
   1927   p[0] = _mm_add_epi16(s[0], t[3]);
   1928   p[1] = _mm_add_epi16(s[1], t[2]);
   1929   p[2] = _mm_sub_epi16(s[1], t[2]);
   1930   p[3] = _mm_sub_epi16(s[0], t[3]);
   1931   p[4] = _mm_sub_epi16(s[7], t[4]);
   1932   p[5] = _mm_sub_epi16(s[6], t[5]);
   1933   p[6] = _mm_add_epi16(s[6], t[5]);
   1934   p[7] = _mm_add_epi16(s[7], t[4]);
   1935 
   1936   // stage 4
   1937   u[0] = _mm_unpacklo_epi16(p[1], p[6]);
   1938   u[1] = _mm_unpackhi_epi16(p[1], p[6]);
   1939   u[2] = _mm_unpacklo_epi16(p[2], p[5]);
   1940   u[3] = _mm_unpackhi_epi16(p[2], p[5]);
   1941 
   1942   v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
   1943   v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
   1944   v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
   1945   v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
   1946   v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
   1947   v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
   1948   v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
   1949   v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
   1950 
   1951   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
   1952   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
   1953   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
   1954   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
   1955   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
   1956   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
   1957   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
   1958   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
   1959 
   1960   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
   1961   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
   1962   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   1963   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
   1964   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
   1965   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
   1966   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
   1967   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
   1968 
   1969   t[1] = _mm_packs_epi32(v[0], v[1]);
   1970   t[2] = _mm_packs_epi32(v[2], v[3]);
   1971   t[5] = _mm_packs_epi32(v[4], v[5]);
   1972   t[6] = _mm_packs_epi32(v[6], v[7]);
   1973 
   1974   // stage 5
   1975   s[0] = _mm_add_epi16(p[0], t[1]);
   1976   s[1] = _mm_sub_epi16(p[0], t[1]);
   1977   s[2] = _mm_sub_epi16(p[3], t[2]);
   1978   s[3] = _mm_add_epi16(p[3], t[2]);
   1979   s[4] = _mm_add_epi16(p[4], t[5]);
   1980   s[5] = _mm_sub_epi16(p[4], t[5]);
   1981   s[6] = _mm_sub_epi16(p[7], t[6]);
   1982   s[7] = _mm_add_epi16(p[7], t[6]);
   1983 
   1984   // stage 6
   1985   u[0] = _mm_unpacklo_epi16(s[0], s[7]);
   1986   u[1] = _mm_unpackhi_epi16(s[0], s[7]);
   1987   u[2] = _mm_unpacklo_epi16(s[1], s[6]);
   1988   u[3] = _mm_unpackhi_epi16(s[1], s[6]);
   1989   u[4] = _mm_unpacklo_epi16(s[2], s[5]);
   1990   u[5] = _mm_unpackhi_epi16(s[2], s[5]);
   1991   u[6] = _mm_unpacklo_epi16(s[3], s[4]);
   1992   u[7] = _mm_unpackhi_epi16(s[3], s[4]);
   1993 
   1994   v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
   1995   v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
   1996   v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
   1997   v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
   1998   v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
   1999   v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
   2000   v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
   2001   v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
   2002   v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
   2003   v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
   2004   v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
   2005   v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
   2006   v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
   2007   v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
   2008   v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
   2009   v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
   2010 
   2011   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
   2012   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
   2013   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
   2014   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
   2015   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
   2016   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
   2017   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
   2018   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
   2019   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
   2020   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
   2021   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
   2022   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
   2023   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
   2024   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
   2025   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
   2026   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
   2027 
   2028   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
   2029   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
   2030   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   2031   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
   2032   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
   2033   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
   2034   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
   2035   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
   2036   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
   2037   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
   2038   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
   2039   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
   2040   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
   2041   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
   2042   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
   2043   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
   2044 
   2045   in[1]  = _mm_packs_epi32(v[0], v[1]);
   2046   in[9]  = _mm_packs_epi32(v[2], v[3]);
   2047   in[5]  = _mm_packs_epi32(v[4], v[5]);
   2048   in[13] = _mm_packs_epi32(v[6], v[7]);
   2049   in[3]  = _mm_packs_epi32(v[8], v[9]);
   2050   in[11] = _mm_packs_epi32(v[10], v[11]);
   2051   in[7]  = _mm_packs_epi32(v[12], v[13]);
   2052   in[15] = _mm_packs_epi32(v[14], v[15]);
   2053 }
   2054 
   2055 void fadst16_1d_8col(__m128i *in) {
    2056   // perform a 16-point 1-D ADST on each of 8 columns
   2057   __m128i s[16], x[16], u[32], v[32];
   2058   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
   2059   const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
   2060   const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
   2061   const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
   2062   const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
   2063   const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
   2064   const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
   2065   const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
   2066   const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
   2067   const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
   2068   const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
   2069   const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
   2070   const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
   2071   const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
   2072   const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
   2073   const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
   2074   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
   2075   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
   2076   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
   2077   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
   2078   const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
   2079   const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
   2080   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
   2081   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   2082   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
   2083   const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
   2084   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   2085   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   2086   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   2087   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   2088   const __m128i kZero = _mm_set1_epi16(0);
   2089 
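           // Unlike the DCT path, each first-stage ADST output keeps the sum
           // or difference of two madd results in 32 bits (u[i] = v[i] +/-
           // v[i + 16]) before a single rounding shift, avoiding an
           // intermediate 16-bit pack.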
   2090   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
   2091   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
   2092   u[2] = _mm_unpacklo_epi16(in[13], in[2]);
   2093   u[3] = _mm_unpackhi_epi16(in[13], in[2]);
   2094   u[4] = _mm_unpacklo_epi16(in[11], in[4]);
   2095   u[5] = _mm_unpackhi_epi16(in[11], in[4]);
   2096   u[6] = _mm_unpacklo_epi16(in[9], in[6]);
   2097   u[7] = _mm_unpackhi_epi16(in[9], in[6]);
   2098   u[8] = _mm_unpacklo_epi16(in[7], in[8]);
   2099   u[9] = _mm_unpackhi_epi16(in[7], in[8]);
   2100   u[10] = _mm_unpacklo_epi16(in[5], in[10]);
   2101   u[11] = _mm_unpackhi_epi16(in[5], in[10]);
   2102   u[12] = _mm_unpacklo_epi16(in[3], in[12]);
   2103   u[13] = _mm_unpackhi_epi16(in[3], in[12]);
   2104   u[14] = _mm_unpacklo_epi16(in[1], in[14]);
   2105   u[15] = _mm_unpackhi_epi16(in[1], in[14]);
   2106 
   2107   v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
   2108   v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
   2109   v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
   2110   v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
   2111   v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
   2112   v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
   2113   v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
   2114   v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
   2115   v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
   2116   v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
   2117   v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
   2118   v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
   2119   v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
   2120   v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
   2121   v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
   2122   v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
   2123   v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
   2124   v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
   2125   v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
   2126   v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
   2127   v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
   2128   v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
   2129   v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
   2130   v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
   2131   v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
   2132   v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
   2133   v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
   2134   v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
   2135   v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
   2136   v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
   2137   v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
   2138   v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
   2139 
  // Butterfly: sum and difference of the two sixteen-value halves.
  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  // Add the rounding constant, then shift back down by DCT_CONST_BITS.
  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  // Pack the 32-bit results back to 16 bits with signed saturation.
  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

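  // Taken together, the blocks above are an eight-lane version of the
  // scalar fixed-point butterfly used throughout this file (cf.
  // dct_const_round_shift() in vp9/common/vp9_idct.h). Illustrative scalar
  // equivalent, ignoring intermediate widths and saturation:
  //   t = a * c0 + b * c1;                              // _mm_madd_epi16
  //   s = (t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;   // round + shift
  // followed by a saturating narrow back to int16_t     // _mm_packs_epi32
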
  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  // x[0..7] are plain 16-bit butterflies of the stage-1 outputs; x[8..15]
  // pack the rotated half back to 16 bits.
  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  // Final output reordering; _mm_sub_epi16(kZero, x) negates x, applying
  // the sign flips the ADST requires on some outputs.
  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

// 1-D 16-point DCT of 16 columns held as two 8-column halves; the final
// transpose leaves the result laid out for the next (row) pass.
void fdct16_1d_sse2(__m128i *in0, __m128i *in1) {
  fdct16_1d_8col(in0);
  fdct16_1d_8col(in1);
  array_transpose_16x16(in0, in1);
}

// 1-D 16-point ADST, same two-half layout and transpose as fdct16_1d_sse2.
void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
  fadst16_1d_8col(in0);
  fadst16_1d_8col(in1);
  array_transpose_16x16(in0, in1);
}

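// Register layout assumed by the 16x16 helpers (as filled by
// load_buffer_16x16): row i of the block is split across in0[i]
// (columns 0-7) and in1[i] (columns 8-15), eight int16 lanes per __m128i.
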
void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
                             int stride, int tx_type) {
  __m128i in0[16], in1[16];
  load_buffer_16x16(input, in0, in1, stride);
  // Each case runs the column transform, a rounding right shift to rescale
  // the intermediate result, then the row transform.
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_1d_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      fadst16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_1d_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      fdct16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_1d_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      fadst16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_1d_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }
  write_buffer_16x16(output, in0, in1, 16);
}

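// Illustrative call (hypothetical buffer names): a 16x16 ADST_ADST forward
// transform of a residual block stored with a stride of 32 int16_t:
//   DECLARE_ALIGNED(16, int16_t, coeffs[16 * 16]);
//   vp9_short_fht16x16_sse2(residual, coeffs, 32, 3 /* ADST_ADST */);
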
#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"  // NOLINT
#undef  FDCT32x32_2D
#undef  FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vp9_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"  // NOLINT
#undef  FDCT32x32_2D
#undef  FDCT32x32_HIGH_PRECISION
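
// The same 32x32 transform body is compiled twice above: once as
// vp9_fdct32x32_rd_sse2 (FDCT32x32_HIGH_PRECISION == 0, reduced internal
// precision) and once as vp9_fdct32x32_sse2 (full precision). A minimal
// sketch of this macro-templating pattern, with hypothetical names:
//   #define KERNEL_NAME fast_kernel
//   #define KERNEL_HIGH_PRECISION 0
//   #include "kernel_impl.c"  /* body is parameterized on the two macros */
//   #undef  KERNEL_NAME
//   #undef  KERNEL_HIGH_PRECISION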