/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2
#include "vp9/common/vp9_idct.h"  // for cospi constants
#include "vpx_ports/mem.h"

#include "vp9/common/x86/vp9_idct_intrin_sse2.h"

void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
  __m128i in0, in1;
  __m128i tmp;
  const __m128i zero = _mm_setzero_si128();
  in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
  in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
  in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
         (input +  2 * stride)));
  in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
         (input +  3 * stride)));

  tmp = _mm_add_epi16(in0, in1);
  in0 = _mm_unpacklo_epi16(zero, tmp);
  in1 = _mm_unpackhi_epi16(zero, tmp);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(tmp, zero);
  in1 = _mm_unpackhi_epi32(tmp, zero);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(tmp, 8);

  in1 = _mm_add_epi32(tmp, in0);
  in0 = _mm_slli_epi32(in1, 1);
  _mm_store_si128((__m128i *)(output), in0);
}
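
// A scalar reference sketch (not part of the original file) of what the
// DC-only transform above computes, assuming the same output convention as
// the C version: the DC coefficient is the sum of all 16 residuals, scaled
// by 2 to match the 4x4 transform's output scale.
static INLINE void fdct4x4_1_scalar_sketch(const int16_t *input,
                                           int16_t *output, int stride) {
  int r, c;
  int sum = 0;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      sum += input[r * stride + c];
  output[0] = (int16_t)(sum << 1);  // lane 0 of the vector store above
}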

void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
  // This 2D transform implements 4 vertical 1D transforms followed
  // by 4 horizontal 1D transforms.  The multiplies and adds are as given
  // by Chen, Smith and Fralick ('77).  The instructions for moving the data
  // around have been minimized by hand.
  // For the purposes of the comments, the 16 inputs are referred to as i0
  // through iF (in raster order); the intermediate variables are labeled
  // a0 through aF, b0 through bF, and so on for each stage, and correspond
  // to the in-place computations mapped to input locations.  The outputs,
  // o0 through oF, are labeled according to the output locations.
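  //
  // In scalar terms, the 1-D 4-point transform being vectorized is (a
  // sketch following the Chen/Smith/Fralick factorization used by the C
  // version, each product rounded via (x + DCT_CONST_ROUNDING) >>
  // DCT_CONST_BITS):
  //   a = i0 + i3;  b = i1 + i2;  c = i1 - i2;  d = i0 - i3;
  //   o0 = (a + b) * cospi_16_64
  //   o2 = (a - b) * cospi_16_64
  //   o1 = c * cospi_24_64 + d * cospi_8_64
  //   o3 = d * cospi_24_64 - c * cospi_8_64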

  // Constants
  // These are the coefficients used for the multiplies.
  // In the comments, pN means cos(N pi / 64) and mN is -cos(N pi / 64),
  // where cospi_N_64 = cos(N pi / 64)
  const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
                                            cospi_16_64, cospi_16_64,
                                            cospi_16_64, -cospi_16_64,
                                            cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
                                            cospi_16_64, -cospi_16_64,
                                            cospi_16_64, cospi_16_64,
                                            cospi_16_64, cospi_16_64);
  const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
                                            cospi_8_64, cospi_24_64,
                                            cospi_24_64, -cospi_8_64,
                                            cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
                                            cospi_24_64, -cospi_8_64,
                                            cospi_8_64, cospi_24_64,
                                            cospi_8_64, cospi_24_64);
  const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
                                            cospi_16_64, cospi_16_64,
                                            cospi_16_64, cospi_16_64,
                                            cospi_16_64, cospi_16_64);
  const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
                                            cospi_16_64, -cospi_16_64,
                                            cospi_16_64, -cospi_16_64,
                                            cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
                                            cospi_8_64, cospi_24_64,
                                            -cospi_8_64, -cospi_24_64,
                                            -cospi_8_64, -cospi_24_64);
  const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
                                            cospi_24_64, -cospi_8_64,
                                            -cospi_24_64, cospi_8_64,
                                            -cospi_24_64, cospi_8_64);

  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // This second rounding constant saves doing some extra adds at the end
  const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
                                               + (DCT_CONST_ROUNDING << 1));
  const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
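  // Why the combined constant works (a note added for clarity, not in the
  // original): with R = DCT_CONST_ROUNDING = 1 << (DCT_CONST_BITS - 1) and
  // arithmetic shifts, the C version's two consecutive roundings satisfy
  //   (((x + R) >> DCT_CONST_BITS) + 1) >> 2 == (x + 3 * R) >> (DCT_CONST_BITS + 2)
  // so a single add of 3 * R and a shift by DCT_CONST_BITS2 replace both.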
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i in0, in1;

  // Load inputs.
  {
    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
    in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
           (input +  2 * stride)));
    in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
           (input +  3 * stride)));
    // in0 = [i0 i1 i2 i3 iC iD iE iF]
    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]

    // multiply by 16 to give some extra precision
    in0 = _mm_slli_epi16(in0, 4);
    in1 = _mm_slli_epi16(in1, 4);
    // if (i == 0 && input[0]) input[0] += 1;
    // add 1 to the upper left pixel if it is non-zero, which helps reduce
    // the round-trip error
    {
      // The mask will only record whether the first value is zero; all
      // other comparisons will fail because something shifted by 4 (the
      // << 4 above) can never be equal to one. To increment in the
      // non-zero case, we add the mask and one for the first element:
      //   - if zero, mask = -1, v = v - 1 + 1 = v
      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
      in0 = _mm_add_epi16(in0, mask);
      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
    }
  }
  // There are 4 stages in total, alternating between an add/subtract stage
  // and a multiply-and-add stage.
  {
    // Stage 1: Add/subtract

    // in0 = [i0 i1 i2 i3 iC iD iE iF]
    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
    // r1 = [iC i8 iD i9 iE iA iF iB]
    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
    // r3 = [iC i8 iD i9 iF iB iE iA]

    const __m128i t0 = _mm_add_epi16(r2, r3);
    const __m128i t1 = _mm_sub_epi16(r2, r3);
    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
    // t1 = [aC a8 aD a9 aF aB aE aA]

    // Stage 2: multiply by constants (which gets us into 32 bits).
    // The constants needed here are:
    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
    // Then add and right-shift to get back to 16-bit range
    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
    // w0 = [b0 b1 b7 b6]
    // w1 = [b8 b9 bF bE]
    // w2 = [b4 b5 b3 b2]
    // w3 = [bC bD bB bA]
    const __m128i x0 = _mm_packs_epi32(w0, w1);
    const __m128i x1 = _mm_packs_epi32(w2, w3);
    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
    // x1 = [b4 b5 b3 b2 bC bD bB bA]
    in0 = _mm_shuffle_epi32(x0, 0xD8);
    in1 = _mm_shuffle_epi32(x1, 0x8D);
    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
    // in1 = [b3 b2 bB bA b4 b5 bC bD]
  }
  {
    // Vertical DCTs finished. Now we do the horizontal DCTs.
    // Stage 3: Add/subtract

    const __m128i t0 = _mm_add_epi16(in0, in1);
    const __m128i t1 = _mm_sub_epi16(in0, in1);
    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]

    // Stage 4: multiply by constants (which gets us into 32 bits).
    // The constants needed here are:
    // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
    // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
    // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
    // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
    const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
    const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
    // Then add and right-shift to get back to 16-bit range, folding in the
    // final right-shift as well to save operations.  This unusual rounding
    // operation maintains bit-accurate compatibility with the C version of
    // this function, which has two rounding steps in a row.
    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
    // w0 = [o0 o4 o8 oC]
    // w1 = [o2 o6 oA oE]
    // w2 = [o1 o5 o9 oD]
    // w3 = [o3 o7 oB oF]
    // remember the o's are numbered according to the correct output location
    const __m128i x0 = _mm_packs_epi32(w0, w1);
    const __m128i x1 = _mm_packs_epi32(w2, w3);
    // x0 = [o0 o4 o8 oC o2 o6 oA oE]
    // x1 = [o1 o5 o9 oD o3 o7 oB oF]
    const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
    const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
    // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
    // y1 = [o2 o3 o6 o7 oA oB oE oF]
    in0 = _mm_unpacklo_epi32(y0, y1);
    // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
    in1 = _mm_unpackhi_epi32(y0, y1);
    // in1 = [o8 o9 oA oB oC oD oE oF]
  }
  // The post-condition (v + 1) >> 2 has been incorporated into the
  // preceding add and right-shift, so only two store instructions are
  // needed, using the fact that rows 1 and 3 are stored immediately after
  // rows 0 and 2, respectively.
  {
    _mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
    _mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
  }
}

static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride) {
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i mask;

  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  in[0] = _mm_slli_epi16(in[0], 4);
  in[1] = _mm_slli_epi16(in[1], 4);
  in[2] = _mm_slli_epi16(in[2], 4);
  in[3] = _mm_slli_epi16(in[3], 4);

  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
  in[0] = _mm_add_epi16(in[0], mask);
  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}

static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
  __m128i out01 = _mm_add_epi16(in01, kOne);
  __m128i out23 = _mm_add_epi16(in23, kOne);
  out01 = _mm_srai_epi16(out01, 2);
  out23 = _mm_srai_epi16(out23, 2);
  _mm_store_si128((__m128i *)(output + 0 * 8), out01);
  _mm_store_si128((__m128i *)(output + 1 * 8), out23);
}

static INLINE void transpose_4x4(__m128i *res) {
  // Combine and transpose
  // 00 01 02 03 20 21 22 23
  // 10 11 12 13 30 31 32 33
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  // 00 10 20 30 01 11 21 31
  // 02 12 22 32 03 13 23 33
  // only use the first 4 16-bit integers
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

void fdct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u[4], v[4];
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[3], in[2]);

  v[0] = _mm_add_epi16(u[0], u[1]);
  v[1] = _mm_sub_epi16(u[0], u[1]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
  transpose_4x4(in);
}

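// In scalar terms, fadst4_sse2 below computes the 4-point ADST used by the
// C version (a sketch read off the intrinsics; x0..x3 are the four inputs
// of one 1-D transform and every output is rounded via
// (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS):
//   o0 = sinpi_1_9 * x0 + sinpi_2_9 * x1 + sinpi_3_9 * x2 + sinpi_4_9 * x3
//   o1 = sinpi_3_9 * (x0 + x1 - x3)
//   o2 = sinpi_4_9 * x0 - sinpi_1_9 * x1 - sinpi_3_9 * x2 + sinpi_2_9 * x3
//   o3 = sinpi_3_9 * (x0 - x1 + x2) - sinpi_2_9 * x3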
void fadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
  __m128i in7 = _mm_add_epi16(in[0], in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);
  u[4] = _mm_unpacklo_epi16(in[3], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);  // sinpi_3_9 * x3

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_sub_epi32(v[2], v[6]);
  u[2] = _mm_add_epi32(v[3], v[4]);
  u[3] = _mm_sub_epi32(u[2], u[0]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_sub_epi32(u[4], v[5]);
  u[6] = _mm_add_epi32(u[3], u[5]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}

void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
                     int stride, int tx_type) {
  __m128i in[4];

  switch (tx_type) {
    case DCT_DCT:
      vp9_fdct4x4_sse2(input, output, stride);
      break;
    case ADST_DCT:
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fdct4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, in, stride);
      fdct4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case ADST_ADST:
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    default:
      assert(0);
      break;
  }
}
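
// Usage sketch (illustrative, not part of the original file): applying the
// hybrid 4x4 transform to a residual block stored with a stride of 4.
// Each helper transforms columns and then transposes, so the first 1-D
// pass of each case above is the vertical one; ADST_DCT therefore runs the
// ADST vertically and the DCT horizontally.
//
//   DECLARE_ALIGNED(16, int16_t, residual[4 * 4]);
//   DECLARE_ALIGNED(16, int16_t, coeff[4 * 4]);
//   vp9_fht4x4_sse2(residual, coeff, 4, ADST_DCT);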

void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i u0, u1, sum;

  u0 = _mm_add_epi16(in0, in1);
  u1 = _mm_add_epi16(in2, in3);

  in0  = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in1  = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in2  = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in3  = _mm_load_si128((const __m128i *)(input + 7 * stride));

  sum = _mm_add_epi16(u0, u1);

  in0 = _mm_add_epi16(in0, in1);
  in2 = _mm_add_epi16(in2, in3);
  sum = _mm_add_epi16(sum, in0);

  u0  = _mm_setzero_si128();
  sum = _mm_add_epi16(sum, in2);

  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  _mm_store_si128((__m128i *)(output), in1);
}
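
// As with the 4x4 case above, a scalar view of this DC-only path (a note
// added for clarity): lane 0 of the final store is simply the sum of all
// 64 input residuals, i.e. the unscaled DC term the C version writes to
// output[0].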
    450 
    451 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
    452   int pass;
    453   // Constants
    454   //    When we use them, in one case, they are all the same. In all others
    455   //    it's a pair of them that we need to repeat four times. This is done
    456   //    by constructing the 32 bit constant corresponding to that pair.
    457   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
    458   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    459   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
    460   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    461   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
    462   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
    463   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
    464   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
    465   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
    466   // Load input
    467   __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
    468   __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
    469   __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
    470   __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
    471   __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
    472   __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
    473   __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
    474   __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
    475   // Pre-condition input (shift by two)
    476   in0 = _mm_slli_epi16(in0, 2);
    477   in1 = _mm_slli_epi16(in1, 2);
    478   in2 = _mm_slli_epi16(in2, 2);
    479   in3 = _mm_slli_epi16(in3, 2);
    480   in4 = _mm_slli_epi16(in4, 2);
    481   in5 = _mm_slli_epi16(in5, 2);
    482   in6 = _mm_slli_epi16(in6, 2);
    483   in7 = _mm_slli_epi16(in7, 2);
    484 
    485   // We do two passes, first the columns, then the rows. The results of the
    486   // first pass are transposed so that the same column code can be reused. The
    487   // results of the second pass are also transposed so that the rows (processed
    488   // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
    }
    // Work on next four results
    {
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
    }
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    //    division of a 16-bit signed number by two, using shifts:
    //    n / 2 = (n - (n >> 15)) >> 1
    //    e.g. n = -3: (-3 - (-1)) >> 1 = -1, matching C's truncating -3 / 2
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
    // store results
    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
  }
}

// load 8x8 array
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride) {
  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));

  in[0] = _mm_slli_epi16(in[0], 2);
  in[1] = _mm_slli_epi16(in[1], 2);
  in[2] = _mm_slli_epi16(in[2], 2);
  in[3] = _mm_slli_epi16(in[3], 2);
  in[4] = _mm_slli_epi16(in[4], 2);
  in[5] = _mm_slli_epi16(in[5], 2);
  in[6] = _mm_slli_epi16(in[6], 2);
  in[7] = _mm_slli_epi16(in[7], 2);
}

// Right shift with rounding. For bit == 1 this computes
// (x + (x < 0)) >> 1; for bit >= 2 it computes
// (x + (1 << (bit - 2)) + (x < 0)) >> bit, which matches the rounding used
// by the C post-conditions of the hybrid transforms.
static INLINE void right_shift_8x8(__m128i *res, const int bit) {
  const __m128i kOne = _mm_set1_epi16(1);
  const int bit_m02 = bit - 2;
  __m128i sign0 = _mm_srai_epi16(res[0], 15);
  __m128i sign1 = _mm_srai_epi16(res[1], 15);
  __m128i sign2 = _mm_srai_epi16(res[2], 15);
  __m128i sign3 = _mm_srai_epi16(res[3], 15);
  __m128i sign4 = _mm_srai_epi16(res[4], 15);
  __m128i sign5 = _mm_srai_epi16(res[5], 15);
  __m128i sign6 = _mm_srai_epi16(res[6], 15);
  __m128i sign7 = _mm_srai_epi16(res[7], 15);

  if (bit_m02 >= 0) {
    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
    res[0] = _mm_add_epi16(res[0], k_const_rounding);
    res[1] = _mm_add_epi16(res[1], k_const_rounding);
    res[2] = _mm_add_epi16(res[2], k_const_rounding);
    res[3] = _mm_add_epi16(res[3], k_const_rounding);
    res[4] = _mm_add_epi16(res[4], k_const_rounding);
    res[5] = _mm_add_epi16(res[5], k_const_rounding);
    res[6] = _mm_add_epi16(res[6], k_const_rounding);
    res[7] = _mm_add_epi16(res[7], k_const_rounding);
  }

  res[0] = _mm_sub_epi16(res[0], sign0);
  res[1] = _mm_sub_epi16(res[1], sign1);
  res[2] = _mm_sub_epi16(res[2], sign2);
  res[3] = _mm_sub_epi16(res[3], sign3);
  res[4] = _mm_sub_epi16(res[4], sign4);
  res[5] = _mm_sub_epi16(res[5], sign5);
  res[6] = _mm_sub_epi16(res[6], sign6);
  res[7] = _mm_sub_epi16(res[7], sign7);

  res[0] = _mm_srai_epi16(res[0], bit);
  res[1] = _mm_srai_epi16(res[1], bit);
  res[2] = _mm_srai_epi16(res[2], bit);
  res[3] = _mm_srai_epi16(res[3], bit);
  res[4] = _mm_srai_epi16(res[4], bit);
  res[5] = _mm_srai_epi16(res[5], bit);
  res[6] = _mm_srai_epi16(res[6], bit);
  res[7] = _mm_srai_epi16(res[7], bit);
}

// write 8x8 array
static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
}

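// In scalar terms, the even half of fdct8_sse2 below computes (a sketch
// read off the intrinsics; s0..s7 are the stage-1 butterflies
// in[i] +/- in[7 - i], and every product is rounded via
// (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS):
//   o0 = (s0 + s3 + s1 + s2) * cospi_16_64
//   o4 = (s0 + s3 - s1 - s2) * cospi_16_64
//   o2 = (s1 - s2) * cospi_24_64 + (s0 - s3) * cospi_8_64
//   o6 = (s0 - s3) * cospi_24_64 - (s1 - s2) * cospi_8_64
// The odd outputs o1, o3, o5, o7 are built from s4..s7 and the rotated
// pair t = (s6 - s5) * cospi_16_64, u = (s6 + s5) * cospi_16_64 in
// stages 2-4.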
void fdct8_sse2(__m128i *in) {
  // constants
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);

  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);

  // shift and rounding
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);

  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);

  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose
  array_transpose_8x8(in, in);
}

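// fadst8_sse2 below evaluates the 8-point ADST. The input permutation at
// the top of the function (in0 = in[7], in1 = in[0], ...) arranges the
// inputs into the butterfly pairs expected by the k__cospi_p02_p30-style
// constants, and the final sign flips and reordering place the results in
// natural output order.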
void fadst8_sse2(__m128i *in) {
  // Constants
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // arrange the inputs in the order the butterflies below expect
  in0  = in[7];
  in1  = in[0];
  in2  = in[5];
  in3  = in[2];
  in4  = in[3];
  in5  = in[4];
  in6  = in[1];
  in7  = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integers
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
   1098   s4 = _mm_packs_epi32(u0, u1);
   1099   s5 = _mm_packs_epi32(u2, u3);
   1100   s6 = _mm_packs_epi32(u4, u5);
   1101   s7 = _mm_packs_epi32(u6, u7);
   1102 
   1103   // stage 3
   1104   u0 = _mm_unpacklo_epi16(s2, s3);
   1105   u1 = _mm_unpackhi_epi16(s2, s3);
   1106   u2 = _mm_unpacklo_epi16(s6, s7);
   1107   u3 = _mm_unpackhi_epi16(s6, s7);
   1108 
   1109   v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
   1110   v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
   1111   v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
   1112   v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
   1113   v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
   1114   v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
   1115   v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
   1116   v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
   1117 
   1118   u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
   1119   u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
   1120   u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
   1121   u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
   1122   u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
   1123   u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
   1124   u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
   1125   u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
   1126 
   1127   v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
   1128   v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
   1129   v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
   1130   v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
   1131   v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
   1132   v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
   1133   v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
   1134   v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
   1135 
   1136   s2 = _mm_packs_epi32(v0, v1);
   1137   s3 = _mm_packs_epi32(v2, v3);
   1138   s6 = _mm_packs_epi32(v4, v5);
   1139   s7 = _mm_packs_epi32(v6, v7);
   1140 
   1141   // FIXME(jingning): do subtract using bit inversion?
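          // (Two's-complement identity: -x == ~x + 1, so each negation below
          // could be an XOR with 0xFFFF plus an add of one; the subtraction
          // from zero is kept for clarity.)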
   1142   in[0] = s0;
   1143   in[1] = _mm_sub_epi16(k__const_0, s4);
   1144   in[2] = s6;
   1145   in[3] = _mm_sub_epi16(k__const_0, s2);
   1146   in[4] = s3;
   1147   in[5] = _mm_sub_epi16(k__const_0, s7);
   1148   in[6] = s5;
   1149   in[7] = _mm_sub_epi16(k__const_0, s1);
   1150 
   1151   // transpose
   1152   array_transpose_8x8(in, in);
   1153 }
   1154 
   1155 void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
   1156                      int stride, int tx_type) {
   1157   __m128i in[8];
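          // tx_type names the vertical then the horizontal 1-D transform (an
          // assumption based on the call order below): e.g. ADST_DCT applies
          // fadst8 first, then fdct8, with the final right shift matching the
          // fdct8x8 scaling.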
   1158 
   1159   switch (tx_type) {
   1160     case DCT_DCT:
   1161       vp9_fdct8x8_sse2(input, output, stride);
   1162       break;
   1163     case ADST_DCT:
   1164       load_buffer_8x8(input, in, stride);
   1165       fadst8_sse2(in);
   1166       fdct8_sse2(in);
   1167       right_shift_8x8(in, 1);
   1168       write_buffer_8x8(output, in, 8);
   1169       break;
   1170     case DCT_ADST:
   1171       load_buffer_8x8(input, in, stride);
   1172       fdct8_sse2(in);
   1173       fadst8_sse2(in);
   1174       right_shift_8x8(in, 1);
   1175       write_buffer_8x8(output, in, 8);
   1176       break;
   1177     case ADST_ADST:
   1178       load_buffer_8x8(input, in, stride);
   1179       fadst8_sse2(in);
   1180       fadst8_sse2(in);
   1181       right_shift_8x8(in, 1);
   1182       write_buffer_8x8(output, in, 8);
   1183       break;
   1184     default:
   1185       assert(0);
   1186       break;
   1187   }
   1188 }
   1189 
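        // The "_1" variant computes only the DC coefficient, like its scalar
        // counterpart: it sums all 16x16 input samples, widening to 32 bits to
        // avoid overflow, then halves the total; only the first lane of the
        // stored vector is meaningful.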
   1190 void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
   1191   __m128i in0, in1, in2, in3;
   1192   __m128i u0, u1;
   1193   __m128i sum = _mm_setzero_si128();
   1194   int i;
   1195 
   1196   for (i = 0; i < 2; ++i) {
   1197     input += 8 * i;
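            // i == 0 covers columns 0-7; i == 1 advances the pointer by 8 to
            // cover columns 8-15.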
   1198     in0  = _mm_load_si128((const __m128i *)(input +  0 * stride));
   1199     in1  = _mm_load_si128((const __m128i *)(input +  1 * stride));
   1200     in2  = _mm_load_si128((const __m128i *)(input +  2 * stride));
   1201     in3  = _mm_load_si128((const __m128i *)(input +  3 * stride));
   1202 
   1203     u0 = _mm_add_epi16(in0, in1);
   1204     u1 = _mm_add_epi16(in2, in3);
   1205     sum = _mm_add_epi16(sum, u0);
   1206 
   1207     in0  = _mm_load_si128((const __m128i *)(input +  4 * stride));
   1208     in1  = _mm_load_si128((const __m128i *)(input +  5 * stride));
   1209     in2  = _mm_load_si128((const __m128i *)(input +  6 * stride));
   1210     in3  = _mm_load_si128((const __m128i *)(input +  7 * stride));
   1211 
   1212     sum = _mm_add_epi16(sum, u1);
   1213     u0  = _mm_add_epi16(in0, in1);
   1214     u1  = _mm_add_epi16(in2, in3);
   1215     sum = _mm_add_epi16(sum, u0);
   1216 
   1217     in0  = _mm_load_si128((const __m128i *)(input +  8 * stride));
   1218     in1  = _mm_load_si128((const __m128i *)(input +  9 * stride));
   1219     in2  = _mm_load_si128((const __m128i *)(input + 10 * stride));
   1220     in3  = _mm_load_si128((const __m128i *)(input + 11 * stride));
   1221 
   1222     sum = _mm_add_epi16(sum, u1);
   1223     u0  = _mm_add_epi16(in0, in1);
   1224     u1  = _mm_add_epi16(in2, in3);
   1225     sum = _mm_add_epi16(sum, u0);
   1226 
   1227     in0  = _mm_load_si128((const __m128i *)(input + 12 * stride));
   1228     in1  = _mm_load_si128((const __m128i *)(input + 13 * stride));
   1229     in2  = _mm_load_si128((const __m128i *)(input + 14 * stride));
   1230     in3  = _mm_load_si128((const __m128i *)(input + 15 * stride));
   1231 
   1232     sum = _mm_add_epi16(sum, u1);
   1233     u0  = _mm_add_epi16(in0, in1);
   1234     u1  = _mm_add_epi16(in2, in3);
   1235     sum = _mm_add_epi16(sum, u0);
   1236 
   1237     sum = _mm_add_epi16(sum, u1);
   1238   }
   1239 
   1240   u0  = _mm_setzero_si128();
   1241   in0 = _mm_unpacklo_epi16(u0, sum);
   1242   in1 = _mm_unpackhi_epi16(u0, sum);
   1243   in0 = _mm_srai_epi32(in0, 16);
   1244   in1 = _mm_srai_epi32(in1, 16);
   1245 
   1246   sum = _mm_add_epi32(in0, in1);
   1247   in0 = _mm_unpacklo_epi32(sum, u0);
   1248   in1 = _mm_unpackhi_epi32(sum, u0);
   1249 
   1250   sum = _mm_add_epi32(in0, in1);
   1251   in0 = _mm_srli_si128(sum, 8);
   1252 
   1253   in1 = _mm_add_epi32(sum, in0);
   1254   in1 = _mm_srai_epi32(in1, 1);
   1255   _mm_store_si128((__m128i *)(output), in1);
   1256 }
   1257 
   1258 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
   1259   // The 2D transform is done in two passes which are very similar. In
   1260   // the first one, we transform the columns and transpose the results.
   1261   // In the second one, we transform the rows. To achieve that, since
   1262   // the first pass results are transposed, we again transform the
   1263   // columns (which are the transposed rows) and transpose the results
   1264   // back into normal/row positions.
   1265   int pass;
   1266   // We need an intermediate buffer between passes.
   1267   DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
   1268   const int16_t *in = input;
   1269   int16_t *out = intermediate;
   1270   // Constants
   1271   //    In one case all eight 16-bit lanes hold the same value. In all
   1272   //    others, a pair of cosines is repeated four times, which is done
   1273   //    by constructing the 32-bit constant corresponding to that pair.
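          //    E.g. pair_set_epi16(a, b) repeats the 16-bit pair (a, b) four
          //    times, so _mm_madd_epi16 against interleaved data (x, y) yields
          //    a * x + b * y in each 32-bit lane.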
   1274   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   1275   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   1276   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   1277   const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
   1278   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   1279   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
   1280   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
   1281   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
   1282   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
   1283   const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
   1284   const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
   1285   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
   1286   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
   1287   const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
   1288   const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
   1289   const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
   1290   const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
   1291   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   1292   const __m128i kOne = _mm_set1_epi16(1);
   1293   // Do the two transform/transpose passes
   1294   for (pass = 0; pass < 2; ++pass) {
   1295     // We process eight columns (transposed rows in second pass) at a time.
   1296     int column_start;
   1297     for (column_start = 0; column_start < 16; column_start += 8) {
   1298       __m128i in00, in01, in02, in03, in04, in05, in06, in07;
   1299       __m128i in08, in09, in10, in11, in12, in13, in14, in15;
   1300       __m128i input0, input1, input2, input3, input4, input5, input6, input7;
   1301       __m128i step1_0, step1_1, step1_2, step1_3;
   1302       __m128i step1_4, step1_5, step1_6, step1_7;
   1303       __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
   1304       __m128i step3_0, step3_1, step3_2, step3_3;
   1305       __m128i step3_4, step3_5, step3_6, step3_7;
   1306       __m128i res00, res01, res02, res03, res04, res05, res06, res07;
   1307       __m128i res08, res09, res10, res11, res12, res13, res14, res15;
   1308       // Load and pre-condition input.
   1309       if (0 == pass) {
   1310         in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
   1311         in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
   1312         in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
   1313         in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
   1314         in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
   1315         in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
   1316         in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
   1317         in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
   1318         in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
   1319         in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
   1320         in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
   1321         in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
   1322         in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
   1323         in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
   1324         in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
   1325         in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
   1326         // x = x << 2
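                // (gain 2 fractional bits of precision for pass 1; pass 2
                // undoes this with the rounded (x + 1) >> 2 below)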
   1327         in00 = _mm_slli_epi16(in00, 2);
   1328         in01 = _mm_slli_epi16(in01, 2);
   1329         in02 = _mm_slli_epi16(in02, 2);
   1330         in03 = _mm_slli_epi16(in03, 2);
   1331         in04 = _mm_slli_epi16(in04, 2);
   1332         in05 = _mm_slli_epi16(in05, 2);
   1333         in06 = _mm_slli_epi16(in06, 2);
   1334         in07 = _mm_slli_epi16(in07, 2);
   1335         in08 = _mm_slli_epi16(in08, 2);
   1336         in09 = _mm_slli_epi16(in09, 2);
   1337         in10 = _mm_slli_epi16(in10, 2);
   1338         in11 = _mm_slli_epi16(in11, 2);
   1339         in12 = _mm_slli_epi16(in12, 2);
   1340         in13 = _mm_slli_epi16(in13, 2);
   1341         in14 = _mm_slli_epi16(in14, 2);
   1342         in15 = _mm_slli_epi16(in15, 2);
   1343       } else {
   1344         in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
   1345         in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
   1346         in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
   1347         in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
   1348         in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
   1349         in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
   1350         in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
   1351         in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
   1352         in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
   1353         in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
   1354         in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
   1355         in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
   1356         in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
   1357         in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
   1358         in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
   1359         in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
   1360         // x = (x + 1) >> 2
   1361         in00 = _mm_add_epi16(in00, kOne);
   1362         in01 = _mm_add_epi16(in01, kOne);
   1363         in02 = _mm_add_epi16(in02, kOne);
   1364         in03 = _mm_add_epi16(in03, kOne);
   1365         in04 = _mm_add_epi16(in04, kOne);
   1366         in05 = _mm_add_epi16(in05, kOne);
   1367         in06 = _mm_add_epi16(in06, kOne);
   1368         in07 = _mm_add_epi16(in07, kOne);
   1369         in08 = _mm_add_epi16(in08, kOne);
   1370         in09 = _mm_add_epi16(in09, kOne);
   1371         in10 = _mm_add_epi16(in10, kOne);
   1372         in11 = _mm_add_epi16(in11, kOne);
   1373         in12 = _mm_add_epi16(in12, kOne);
   1374         in13 = _mm_add_epi16(in13, kOne);
   1375         in14 = _mm_add_epi16(in14, kOne);
   1376         in15 = _mm_add_epi16(in15, kOne);
   1377         in00 = _mm_srai_epi16(in00, 2);
   1378         in01 = _mm_srai_epi16(in01, 2);
   1379         in02 = _mm_srai_epi16(in02, 2);
   1380         in03 = _mm_srai_epi16(in03, 2);
   1381         in04 = _mm_srai_epi16(in04, 2);
   1382         in05 = _mm_srai_epi16(in05, 2);
   1383         in06 = _mm_srai_epi16(in06, 2);
   1384         in07 = _mm_srai_epi16(in07, 2);
   1385         in08 = _mm_srai_epi16(in08, 2);
   1386         in09 = _mm_srai_epi16(in09, 2);
   1387         in10 = _mm_srai_epi16(in10, 2);
   1388         in11 = _mm_srai_epi16(in11, 2);
   1389         in12 = _mm_srai_epi16(in12, 2);
   1390         in13 = _mm_srai_epi16(in13, 2);
   1391         in14 = _mm_srai_epi16(in14, 2);
   1392         in15 = _mm_srai_epi16(in15, 2);
   1393       }
   1394       in += 8;
   1395       // Calculate input for the first 8 results.
   1396       {
   1397         input0 = _mm_add_epi16(in00, in15);
   1398         input1 = _mm_add_epi16(in01, in14);
   1399         input2 = _mm_add_epi16(in02, in13);
   1400         input3 = _mm_add_epi16(in03, in12);
   1401         input4 = _mm_add_epi16(in04, in11);
   1402         input5 = _mm_add_epi16(in05, in10);
   1403         input6 = _mm_add_epi16(in06, in09);
   1404         input7 = _mm_add_epi16(in07, in08);
   1405       }
   1406       // Calculate input for the next 8 results.
   1407       {
   1408         step1_0 = _mm_sub_epi16(in07, in08);
   1409         step1_1 = _mm_sub_epi16(in06, in09);
   1410         step1_2 = _mm_sub_epi16(in05, in10);
   1411         step1_3 = _mm_sub_epi16(in04, in11);
   1412         step1_4 = _mm_sub_epi16(in03, in12);
   1413         step1_5 = _mm_sub_epi16(in02, in13);
   1414         step1_6 = _mm_sub_epi16(in01, in14);
   1415         step1_7 = _mm_sub_epi16(in00, in15);
   1416       }
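              // The 16-point DCT splits into an 8-point DCT on the sums above
              // (producing the even-indexed results) and further stages on the
              // differences in step1 (producing the odd-indexed results).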
   1417       // Work on the first eight values; fdct8(input, even_results);
   1418       {
   1419         // Add/subtract
   1420         const __m128i q0 = _mm_add_epi16(input0, input7);
   1421         const __m128i q1 = _mm_add_epi16(input1, input6);
   1422         const __m128i q2 = _mm_add_epi16(input2, input5);
   1423         const __m128i q3 = _mm_add_epi16(input3, input4);
   1424         const __m128i q4 = _mm_sub_epi16(input3, input4);
   1425         const __m128i q5 = _mm_sub_epi16(input2, input5);
   1426         const __m128i q6 = _mm_sub_epi16(input1, input6);
   1427         const __m128i q7 = _mm_sub_epi16(input0, input7);
   1428         // Work on first four results
   1429         {
   1430           // Add/subtract
   1431           const __m128i r0 = _mm_add_epi16(q0, q3);
   1432           const __m128i r1 = _mm_add_epi16(q1, q2);
   1433           const __m128i r2 = _mm_sub_epi16(q1, q2);
   1434           const __m128i r3 = _mm_sub_epi16(q0, q3);
   1435           // Interleave to do the multiply by constants which gets us
   1436           // into 32 bits.
   1437           const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
   1438           const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
   1439           const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
   1440           const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
   1441           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
   1442           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
   1443           const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
   1444           const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
   1445           const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
   1446           const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
   1447           const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
   1448           const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
   1449           // dct_const_round_shift
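                  // i.e. (x + 2^(DCT_CONST_BITS - 1)) >> DCT_CONST_BITS,
                  // a round-to-nearest right shift.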
   1450           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1451           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1452           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1453           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1454           const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
   1455           const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
   1456           const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
   1457           const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
   1458           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1459           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1460           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1461           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1462           const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
   1463           const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
   1464           const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
   1465           const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
   1466           // Combine
   1467           res00 = _mm_packs_epi32(w0, w1);
   1468           res08 = _mm_packs_epi32(w2, w3);
   1469           res04 = _mm_packs_epi32(w4, w5);
   1470           res12 = _mm_packs_epi32(w6, w7);
   1471         }
   1472         // Work on next four results
   1473         {
   1474           // Interleave to do the multiply by constants which gets us
   1475           // into 32 bits.
   1476           const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
   1477           const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
   1478           const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
   1479           const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
   1480           const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
   1481           const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
   1482           // dct_const_round_shift
   1483           const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
   1484           const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
   1485           const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
   1486           const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
   1487           const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
   1488           const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
   1489           const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
   1490           const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
   1491           // Combine
   1492           const __m128i r0 = _mm_packs_epi32(s0, s1);
   1493           const __m128i r1 = _mm_packs_epi32(s2, s3);
   1494           // Add/subtract
   1495           const __m128i x0 = _mm_add_epi16(q4, r0);
   1496           const __m128i x1 = _mm_sub_epi16(q4, r0);
   1497           const __m128i x2 = _mm_sub_epi16(q7, r1);
   1498           const __m128i x3 = _mm_add_epi16(q7, r1);
   1499           // Interleave to do the multiply by constants which gets us
   1500           // into 32 bits.
   1501           const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
   1502           const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
   1503           const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
   1504           const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
   1505           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
   1506           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
   1507           const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
   1508           const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
   1509           const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
   1510           const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
   1511           const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
   1512           const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
   1513           // dct_const_round_shift
   1514           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1515           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1516           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1517           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1518           const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
   1519           const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
   1520           const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
   1521           const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
   1522           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1523           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1524           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1525           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1526           const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
   1527           const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
   1528           const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
   1529           const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
   1530           // Combine
   1531           res02 = _mm_packs_epi32(w0, w1);
   1532           res14 = _mm_packs_epi32(w2, w3);
   1533           res10 = _mm_packs_epi32(w4, w5);
   1534           res06 = _mm_packs_epi32(w6, w7);
   1535         }
   1536       }
   1537       // Work on the next eight values; step1 -> odd_results
   1538       {
   1539         // step 2
   1540         {
   1541           const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
   1542           const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
   1543           const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
   1544           const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
   1545           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
   1546           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
   1547           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
   1548           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
   1549           // dct_const_round_shift
   1550           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1551           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1552           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1553           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1554           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1555           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1556           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1557           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1558           // Combine
   1559           step2_2 = _mm_packs_epi32(w0, w1);
   1560           step2_3 = _mm_packs_epi32(w2, w3);
   1561         }
   1562         {
   1563           const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
   1564           const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
   1565           const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
   1566           const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
   1567           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
   1568           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
   1569           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
   1570           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
   1571           // dct_const_round_shift
   1572           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1573           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1574           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1575           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1576           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1577           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1578           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1579           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1580           // Combine
   1581           step2_5 = _mm_packs_epi32(w0, w1);
   1582           step2_4 = _mm_packs_epi32(w2, w3);
   1583         }
   1584         // step 3
   1585         {
   1586           step3_0 = _mm_add_epi16(step1_0, step2_3);
   1587           step3_1 = _mm_add_epi16(step1_1, step2_2);
   1588           step3_2 = _mm_sub_epi16(step1_1, step2_2);
   1589           step3_3 = _mm_sub_epi16(step1_0, step2_3);
   1590           step3_4 = _mm_sub_epi16(step1_7, step2_4);
   1591           step3_5 = _mm_sub_epi16(step1_6, step2_5);
   1592           step3_6 = _mm_add_epi16(step1_6, step2_5);
   1593           step3_7 = _mm_add_epi16(step1_7, step2_4);
   1594         }
   1595         // step 4
   1596         {
   1597           const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
   1598           const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
   1599           const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
   1600           const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
   1601           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
   1602           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
   1603           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
   1604           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
   1605           // dct_const_round_shift
   1606           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1607           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1608           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1609           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1610           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1611           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1612           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1613           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1614           // Combine
   1615           step2_1 = _mm_packs_epi32(w0, w1);
   1616           step2_2 = _mm_packs_epi32(w2, w3);
   1617         }
   1618         {
   1619           const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
   1620           const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
   1621           const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
   1622           const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
   1623           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
   1624           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
   1625           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
   1626           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
   1627           // dct_const_round_shift
   1628           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1629           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1630           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1631           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1632           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1633           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1634           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1635           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1636           // Combine
   1637           step2_6 = _mm_packs_epi32(w0, w1);
   1638           step2_5 = _mm_packs_epi32(w2, w3);
   1639         }
   1640         // step 5
   1641         {
   1642           step1_0 = _mm_add_epi16(step3_0, step2_1);
   1643           step1_1 = _mm_sub_epi16(step3_0, step2_1);
   1644           step1_2 = _mm_add_epi16(step3_3, step2_2);
   1645           step1_3 = _mm_sub_epi16(step3_3, step2_2);
   1646           step1_4 = _mm_sub_epi16(step3_4, step2_5);
   1647           step1_5 = _mm_add_epi16(step3_4, step2_5);
   1648           step1_6 = _mm_sub_epi16(step3_7, step2_6);
   1649           step1_7 = _mm_add_epi16(step3_7, step2_6);
   1650         }
   1651         // step 6
   1652         {
   1653           const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
   1654           const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
   1655           const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
   1656           const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
   1657           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
   1658           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
   1659           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
   1660           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
   1661           // dct_const_round_shift
   1662           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1663           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1664           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1665           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1666           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1667           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1668           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1669           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1670           // Combine
   1671           res01 = _mm_packs_epi32(w0, w1);
   1672           res09 = _mm_packs_epi32(w2, w3);
   1673         }
   1674         {
   1675           const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
   1676           const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
   1677           const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
   1678           const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
   1679           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
   1680           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
   1681           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
   1682           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
   1683           // dct_const_round_shift
   1684           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1685           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1686           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1687           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1688           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1689           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1690           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1691           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1692           // Combine
   1693           res05 = _mm_packs_epi32(w0, w1);
   1694           res13 = _mm_packs_epi32(w2, w3);
   1695         }
   1696         {
   1697           const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
   1698           const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
   1699           const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
   1700           const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
   1701           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
   1702           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
   1703           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
   1704           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
   1705           // dct_const_round_shift
   1706           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1707           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1708           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1709           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1710           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1711           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1712           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1713           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1714           // Combine
   1715           res11 = _mm_packs_epi32(w0, w1);
   1716           res03 = _mm_packs_epi32(w2, w3);
   1717         }
   1718         {
   1719           const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
   1720           const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
   1721           const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
   1722           const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
   1723           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
   1724           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
   1725           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
   1726           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
   1727           // dct_const_round_shift
   1728           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
   1729           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
   1730           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
   1731           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
   1732           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
   1733           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
   1734           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
   1735           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
   1736           // Combine
   1737           res15 = _mm_packs_epi32(w0, w1);
   1738           res07 = _mm_packs_epi32(w2, w3);
   1739         }
   1740       }
   1741       // Transpose the results, do it as two 8x8 transposes.
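              // Each 8x8 transpose is three rounds of interleaves: 16-bit,
              // then 32-bit, then 64-bit unpacks.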
   1742       {
   1743         // 00 01 02 03 04 05 06 07
   1744         // 10 11 12 13 14 15 16 17
   1745         // 20 21 22 23 24 25 26 27
   1746         // 30 31 32 33 34 35 36 37
   1747         // 40 41 42 43 44 45 46 47
   1748         // 50 51 52 53 54 55 56 57
   1749         // 60 61 62 63 64 65 66 67
   1750         // 70 71 72 73 74 75 76 77
   1751         const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
   1752         const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
   1753         const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
   1754         const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
   1755         const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
   1756         const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
   1757         const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
   1758         const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
   1759         // 00 10 01 11 02 12 03 13
   1760         // 20 30 21 31 22 32 23 33
   1761         // 04 14 05 15 06 16 07 17
   1762         // 24 34 25 35 26 36 27 37
   1763         // 40 50 41 51 42 52 43 53
   1764         // 60 70 61 71 62 72 63 73
   1765         // 44 54 45 55 46 56 47 57
   1766         // 64 74 65 75 66 76 67 77
   1767         const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
   1768         const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
   1769         const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
   1770         const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
   1771         const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
   1772         const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
   1773         const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
   1774         const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
   1775         // 00 10 20 30 01 11 21 31
   1776         // 40 50 60 70 41 51 61 71
   1777         // 02 12 22 32 03 13 23 33
   1778         // 42 52 62 72 43 53 63 73
   1779         // 04 14 24 34 05 15 25 35
   1780         // 44 54 64 74 45 55 65 75
   1781         // 06 16 26 36 07 17 27 37
   1782         // 46 56 66 76 47 57 67 77
   1783         const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
   1784         const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
   1785         const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
   1786         const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
   1787         const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
   1788         const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
   1789         const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
   1790         const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
   1791         // 00 10 20 30 40 50 60 70
   1792         // 01 11 21 31 41 51 61 71
   1793         // 02 12 22 32 42 52 62 72
   1794         // 03 13 23 33 43 53 63 73
   1795         // 04 14 24 34 44 54 64 74
   1796         // 05 15 25 35 45 55 65 75
   1797         // 06 16 26 36 46 56 66 76
   1798         // 07 17 27 37 47 57 67 77
   1799         _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
   1800         _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
   1801         _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
   1802         _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
   1803         _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
   1804         _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
   1805         _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
   1806         _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
   1807       }
   1808       {
   1809         // 00 01 02 03 04 05 06 07
   1810         // 10 11 12 13 14 15 16 17
   1811         // 20 21 22 23 24 25 26 27
   1812         // 30 31 32 33 34 35 36 37
   1813         // 40 41 42 43 44 45 46 47
   1814         // 50 51 52 53 54 55 56 57
   1815         // 60 61 62 63 64 65 66 67
   1816         // 70 71 72 73 74 75 76 77
   1817         const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
   1818         const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
   1819         const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
   1820         const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
   1821         const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
   1822         const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
   1823         const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
   1824         const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
   1825         // 00 10 01 11 02 12 03 13
   1826         // 20 30 21 31 22 32 23 33
   1827         // 04 14 05 15 06 16 07 17
   1828         // 24 34 25 35 26 36 27 37
   1829         // 40 50 41 51 42 52 43 53
   1830         // 60 70 61 71 62 72 63 73
   1831         // 44 54 45 55 46 56 47 57
   1832         // 64 74 65 75 66 76 67 77
   1833         const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
   1834         const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
   1835         const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
   1836         const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
   1837         const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
   1838         const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
   1839         const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
   1840         const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
   1841         // 00 10 20 30 01 11 21 31
   1842         // 40 50 60 70 41 51 61 71
   1843         // 02 12 22 32 03 13 23 33
   1844         // 42 52 62 72 43 53 63 73
   1845         // 04 14 24 34 05 15 25 35
   1846         // 44 54 64 74 45 55 65 75
   1847         // 06 16 26 36 07 17 27 37
   1848         // 46 56 66 76 47 57 67 77
   1849         const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
   1850         const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
   1851         const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
   1852         const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
   1853         const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
   1854         const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
   1855         const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
   1856         const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
   1857         // 00 10 20 30 40 50 60 70
   1858         // 01 11 21 31 41 51 61 71
   1859         // 02 12 22 32 42 52 62 72
   1860         // 03 13 23 33 43 53 63 73
   1861         // 04 14 24 34 44 54 64 74
   1862         // 05 15 25 35 45 55 65 75
   1863         // 06 16 26 36 46 56 66 76
   1864         // 07 17 27 37 47 57 67 77
   1865         // Store results
   1866         _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
   1867         _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
   1868         _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
   1869         _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
   1870         _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
   1871         _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
   1872         _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
   1873         _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
   1874       }
   1875       out += 8*16;
   1876     }
   1877     // Set up in/out for the next pass.
   1878     in = intermediate;
   1879     out = output;
   1880   }
   1881 }
   1882 
   1883 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
   1884                                      __m128i *in1, int stride) {
   1885   // load first 8 columns
   1886   load_buffer_8x8(input, in0, stride);
   1887   load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
   1888 
   1889   input += 8;
   1890   // load second 8 columns
   1891   load_buffer_8x8(input, in1, stride);
   1892   load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
   1893 }
   1894 
   1895 static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
   1896                                       __m128i *in1, int stride) {
   1897   // write first 8 columns
   1898   write_buffer_8x8(output, in0, stride);
   1899   write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
   1900   // write second 8 columns
   1901   output += 8;
   1902   write_buffer_8x8(output, in1, stride);
   1903   write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
   1904 }
   1905 
   1906 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
   1907   // round each coefficient and shift right by 2 (divide by 4)
   1908   right_shift_8x8(res0, 2);
   1909   right_shift_8x8(res0 + 8, 2);
   1910   right_shift_8x8(res1, 2);
   1911   right_shift_8x8(res1 + 8, 2);
   1912 }
   1913 
   1914 void fdct16_8col(__m128i *in) {
   1915   // perform a 16-point 1-D DCT on each of 8 columns
   1916   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
   1917   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   1918   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   1919   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   1920   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   1921   const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
   1922   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   1923   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
   1924   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
   1925   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
   1926   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
   1927   const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
   1928   const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
   1929   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
   1930   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
   1931   const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
   1932   const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
   1933   const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
   1934   const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
   1935   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   1936 
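          // The stages below mirror the scalar 16-point forward DCT: stage 1
          // forms sums i[] (which reduce to an 8-point DCT giving the even
          // outputs) and differences s[] (which flow through stages 2-6 to
          // give the odd outputs).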
   1937   // stage 1
   1938   i[0] = _mm_add_epi16(in[0], in[15]);
   1939   i[1] = _mm_add_epi16(in[1], in[14]);
   1940   i[2] = _mm_add_epi16(in[2], in[13]);
   1941   i[3] = _mm_add_epi16(in[3], in[12]);
   1942   i[4] = _mm_add_epi16(in[4], in[11]);
   1943   i[5] = _mm_add_epi16(in[5], in[10]);
   1944   i[6] = _mm_add_epi16(in[6], in[9]);
   1945   i[7] = _mm_add_epi16(in[7], in[8]);
   1946 
   1947   s[0] = _mm_sub_epi16(in[7], in[8]);
   1948   s[1] = _mm_sub_epi16(in[6], in[9]);
   1949   s[2] = _mm_sub_epi16(in[5], in[10]);
   1950   s[3] = _mm_sub_epi16(in[4], in[11]);
   1951   s[4] = _mm_sub_epi16(in[3], in[12]);
   1952   s[5] = _mm_sub_epi16(in[2], in[13]);
   1953   s[6] = _mm_sub_epi16(in[1], in[14]);
   1954   s[7] = _mm_sub_epi16(in[0], in[15]);
   1955 
   1956   p[0] = _mm_add_epi16(i[0], i[7]);
   1957   p[1] = _mm_add_epi16(i[1], i[6]);
   1958   p[2] = _mm_add_epi16(i[2], i[5]);
   1959   p[3] = _mm_add_epi16(i[3], i[4]);
   1960   p[4] = _mm_sub_epi16(i[3], i[4]);
   1961   p[5] = _mm_sub_epi16(i[2], i[5]);
   1962   p[6] = _mm_sub_epi16(i[1], i[6]);
   1963   p[7] = _mm_sub_epi16(i[0], i[7]);
   1964 
   1965   u[0] = _mm_add_epi16(p[0], p[3]);
   1966   u[1] = _mm_add_epi16(p[1], p[2]);
   1967   u[2] = _mm_sub_epi16(p[1], p[2]);
   1968   u[3] = _mm_sub_epi16(p[0], p[3]);
   1969 
   1970   v[0] = _mm_unpacklo_epi16(u[0], u[1]);
   1971   v[1] = _mm_unpackhi_epi16(u[0], u[1]);
   1972   v[2] = _mm_unpacklo_epi16(u[2], u[3]);
   1973   v[3] = _mm_unpackhi_epi16(u[2], u[3]);
   1974 
   1975   u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
   1976   u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
   1977   u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
   1978   u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
   1979   u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
   1980   u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
   1981   u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
   1982   u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
   1983 
   1984   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
   1985   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
   1986   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
   1987   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
   1988   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
   1989   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
   1990   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
   1991   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
   1992 
   1993   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
   1994   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
   1995   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
   1996   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
   1997   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
   1998   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
   1999   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
   2000   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
   2001 
   2002   in[0] = _mm_packs_epi32(u[0], u[1]);
   2003   in[4] = _mm_packs_epi32(u[4], u[5]);
   2004   in[8] = _mm_packs_epi32(u[2], u[3]);
   2005   in[12] = _mm_packs_epi32(u[6], u[7]);
   2006 
   2007   u[0] = _mm_unpacklo_epi16(p[5], p[6]);
   2008   u[1] = _mm_unpackhi_epi16(p[5], p[6]);
   2009   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
   2010   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
   2011   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
   2012   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
   2013 
   2014   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
   2015   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
   2016   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
   2017   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
   2018 
   2019   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
   2020   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
   2021   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   2022   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
   2023 
   2024   u[0] = _mm_packs_epi32(v[0], v[1]);
   2025   u[1] = _mm_packs_epi32(v[2], v[3]);
   2026 
   2027   t[0] = _mm_add_epi16(p[4], u[0]);
   2028   t[1] = _mm_sub_epi16(p[4], u[0]);
   2029   t[2] = _mm_sub_epi16(p[7], u[1]);
   2030   t[3] = _mm_add_epi16(p[7], u[1]);
   2031 
   2032   u[0] = _mm_unpacklo_epi16(t[0], t[3]);
   2033   u[1] = _mm_unpackhi_epi16(t[0], t[3]);
   2034   u[2] = _mm_unpacklo_epi16(t[1], t[2]);
   2035   u[3] = _mm_unpackhi_epi16(t[1], t[2]);
   2036 
   2037   v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
   2038   v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
   2039   v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
   2040   v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
   2041   v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
   2042   v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
   2043   v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
   2044   v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
   2045 
   2046   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
   2047   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
   2048   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
   2049   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
   2050   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
   2051   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
   2052   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
   2053   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
   2054 
   2055   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
   2056   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
   2057   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   2058   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
   2059   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
   2060   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
   2061   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
   2062   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
   2063 
   2064   in[2] = _mm_packs_epi32(v[0], v[1]);
   2065   in[6] = _mm_packs_epi32(v[4], v[5]);
   2066   in[10] = _mm_packs_epi32(v[2], v[3]);
   2067   in[14] = _mm_packs_epi32(v[6], v[7]);
   2068 
   2069   // stage 2
   2070   u[0] = _mm_unpacklo_epi16(s[2], s[5]);
   2071   u[1] = _mm_unpackhi_epi16(s[2], s[5]);
   2072   u[2] = _mm_unpacklo_epi16(s[3], s[4]);
   2073   u[3] = _mm_unpackhi_epi16(s[3], s[4]);
   2074 
   2075   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
   2076   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
   2077   v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
   2078   v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
   2079   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
   2080   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
   2081   v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
   2082   v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
   2083 
   2084   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
   2085   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
   2086   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
   2087   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
   2088   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
   2089   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
   2090   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
   2091   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
   2092 
   2093   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
   2094   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
   2095   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   2096   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
   2097   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
   2098   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
   2099   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
   2100   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
   2101 
   2102   t[2] = _mm_packs_epi32(v[0], v[1]);
   2103   t[3] = _mm_packs_epi32(v[2], v[3]);
   2104   t[4] = _mm_packs_epi32(v[4], v[5]);
   2105   t[5] = _mm_packs_epi32(v[6], v[7]);
   2106 
   2107   // stage 3
   2108   p[0] = _mm_add_epi16(s[0], t[3]);
   2109   p[1] = _mm_add_epi16(s[1], t[2]);
   2110   p[2] = _mm_sub_epi16(s[1], t[2]);
   2111   p[3] = _mm_sub_epi16(s[0], t[3]);
   2112   p[4] = _mm_sub_epi16(s[7], t[4]);
   2113   p[5] = _mm_sub_epi16(s[6], t[5]);
   2114   p[6] = _mm_add_epi16(s[6], t[5]);
   2115   p[7] = _mm_add_epi16(s[7], t[4]);
   2116 
   2117   // stage 4
   2118   u[0] = _mm_unpacklo_epi16(p[1], p[6]);
   2119   u[1] = _mm_unpackhi_epi16(p[1], p[6]);
   2120   u[2] = _mm_unpacklo_epi16(p[2], p[5]);
   2121   u[3] = _mm_unpackhi_epi16(p[2], p[5]);
   2122 
   2123   v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
   2124   v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
   2125   v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
   2126   v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
   2127   v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
   2128   v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
   2129   v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
   2130   v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
   2131 
   2132   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
   2133   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
   2134   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
   2135   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
   2136   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
   2137   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
   2138   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
   2139   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
   2140 
   2141   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
   2142   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
   2143   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   2144   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
   2145   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
   2146   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
   2147   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
   2148   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
   2149 
   2150   t[1] = _mm_packs_epi32(v[0], v[1]);
   2151   t[2] = _mm_packs_epi32(v[2], v[3]);
   2152   t[5] = _mm_packs_epi32(v[4], v[5]);
   2153   t[6] = _mm_packs_epi32(v[6], v[7]);

  // stage 5
  s[0] = _mm_add_epi16(p[0], t[1]);
  s[1] = _mm_sub_epi16(p[0], t[1]);
  s[2] = _mm_add_epi16(p[3], t[2]);
  s[3] = _mm_sub_epi16(p[3], t[2]);
  s[4] = _mm_sub_epi16(p[4], t[5]);
  s[5] = _mm_add_epi16(p[4], t[5]);
  s[6] = _mm_sub_epi16(p[7], t[6]);
  s[7] = _mm_add_epi16(p[7], t[6]);

  // stage 6
  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
  u[7] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

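  // These are the odd-indexed outputs of the 16-point DCT; note the
  // bit-reversed row order (1, 9, 5, 13, 3, 11, 7, 15).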
  in[1]  = _mm_packs_epi32(v[0], v[1]);
  in[9]  = _mm_packs_epi32(v[2], v[3]);
  in[5]  = _mm_packs_epi32(v[4], v[5]);
  in[13] = _mm_packs_epi32(v[6], v[7]);
  in[3]  = _mm_packs_epi32(v[8], v[9]);
  in[11] = _mm_packs_epi32(v[10], v[11]);
  in[7]  = _mm_packs_epi32(v[12], v[13]);
  in[15] = _mm_packs_epi32(v[14], v[15]);
}

void fadst16_8col(__m128i *in) {
  // Perform a 16-point 1-D ADST on each of the 8 columns held in in[0..15].
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

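  // stage 1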
  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

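  // The top half of this stage needs only 16-bit adds and subtracts because
  // no multiplies were involved on that path; x[8]-x[15] take the rounded
  // 32-bit results from the multiply path above.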
  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

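  // Final output: rows are taken directly from s[] or from the rounded
  // products above, with sign flips on rows 1, 3, 13 and 15 as the ADST
  // output stage requires.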
  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

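// Each 16x16 1-D transform below is computed as two 8-column passes over
// the left and right halves of the block, followed by a 16x16 transpose so
// that the second pass of the 2-D transform operates on rows.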
void fdct16_sse2(__m128i *in0, __m128i *in1) {
  fdct16_8col(in0);
  fdct16_8col(in1);
  array_transpose_16x16(in0, in1);
}

void fadst16_sse2(__m128i *in0, __m128i *in1) {
  fadst16_8col(in0);
  fadst16_8col(in1);
  array_transpose_16x16(in0, in1);
}

void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
                       int stride, int tx_type) {
  __m128i in0[16], in1[16];

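  // Each case below is a separable 2-D transform: one 1-D pass, a rescale
  // of the intermediate via right_shift_16x16(), then the second 1-D pass
  // (the transpose inside each *16_sse2() call switches dimensions).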
  switch (tx_type) {
    case DCT_DCT:
      vp9_fdct16x16_sse2(input, output, stride);
      break;
    case ADST_DCT:
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    case DCT_ADST:
      load_buffer_16x16(input, in0, in1, stride);
      fdct16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    case ADST_ADST:
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    default:
      assert(0);
      break;
  }
}
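
/* Example call (illustrative only; 'residual' and 'coeffs' are hypothetical
 * 16-byte-aligned int16_t buffers, with the residual laid out at a stride
 * of 32 elements per row):
 *
 *   DECLARE_ALIGNED(16, int16_t, residual[32 * 16]);
 *   DECLARE_ALIGNED(16, int16_t, coeffs[16 * 16]);
 *   vp9_fht16x16_sse2(residual, coeffs, 32, ADST_DCT);
 */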

void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

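  // Each loop iteration accumulates four rows of 32 coefficients, so eight
  // iterations cover the whole 32x32 block.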
  for (i = 0; i < 8; ++i) {
    in0  = _mm_load_si128((const __m128i *)(input +  0));
    in1  = _mm_load_si128((const __m128i *)(input +  8));
    in2  = _mm_load_si128((const __m128i *)(input + 16));
    in3  = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0  = _mm_load_si128((const __m128i *)(input +  0));
    in1  = _mm_load_si128((const __m128i *)(input +  8));
    in2  = _mm_load_si128((const __m128i *)(input + 16));
    in3  = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0  = _mm_add_epi16(in0, in1);
    u1  = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0  = _mm_load_si128((const __m128i *)(input +  0));
    in1  = _mm_load_si128((const __m128i *)(input +  8));
    in2  = _mm_load_si128((const __m128i *)(input + 16));
    in3  = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0  = _mm_add_epi16(in0, in1);
    u1  = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0  = _mm_load_si128((const __m128i *)(input +  0));
    in1  = _mm_load_si128((const __m128i *)(input +  8));
    in2  = _mm_load_si128((const __m128i *)(input + 16));
    in3  = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0  = _mm_add_epi16(in0, in1);
    u1  = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
  }

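  // Horizontal reduction: sign-extend the 16-bit partial sums to 32 bits
  // (unpack against zero, then arithmetic shift), then fold the vector in
  // half twice until lane 0 holds the total.  The DC output is that total
  // scaled down by the final >> 3.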
  u0  = _mm_setzero_si128();
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  in1 = _mm_srai_epi32(in1, 3);
  _mm_store_si128((__m128i *)(output), in1);
}

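// The 32x32 forward DCT is generated twice from the same template source:
// once with FDCT32x32_HIGH_PRECISION set to 0 for the reduced-precision
// variant used during rate-distortion search (vp9_fdct32x32_rd_sse2), and
// once with it set to 1 for the full-precision vp9_fdct32x32_sse2.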
#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
#undef  FDCT32x32_HIGH_PRECISION
#undef  FDCT32x32_2D

#define FDCT32x32_2D vp9_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"  // NOLINT
#undef  FDCT32x32_HIGH_PRECISION
#undef  FDCT32x32_2D