Home | History | Annotate | Download | only in encoder
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <math.h>
     13 
     14 #include "./vpx_config.h"
     15 #include "./vp9_rtcd.h"
     16 
     17 #include "vp9/common/vp9_blockd.h"
     18 #include "vp9/common/vp9_idct.h"
     19 #include "vp9/common/vp9_systemdependent.h"
     20 
     21 #include "vp9/encoder/vp9_dct.h"
     22 
     23 static void fdct4(const int16_t *input, int16_t *output) {
     24   int16_t step[4];
     25   int temp1, temp2;
     26 
     27   step[0] = input[0] + input[3];
     28   step[1] = input[1] + input[2];
     29   step[2] = input[1] - input[2];
     30   step[3] = input[0] - input[3];
     31 
     32   temp1 = (step[0] + step[1]) * cospi_16_64;
     33   temp2 = (step[0] - step[1]) * cospi_16_64;
     34   output[0] = dct_const_round_shift(temp1);
     35   output[2] = dct_const_round_shift(temp2);
     36   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
     37   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
     38   output[1] = dct_const_round_shift(temp1);
     39   output[3] = dct_const_round_shift(temp2);
     40 }
     41 
     42 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
     43   // The 2D transform is done with two passes which are actually pretty
     44   // similar. In the first one, we transform the columns and transpose
     45   // the results. In the second one, we transform the rows. To achieve that,
     46   // as the first pass results are transposed, we tranpose the columns (that
     47   // is the transposed rows) and transpose the results (so that it goes back
     48   // in normal/row positions).
     49   int pass;
     50   // We need an intermediate buffer between passes.
     51   int16_t intermediate[4 * 4];
     52   const int16_t *in = input;
     53   int16_t *out = intermediate;
     54   // Do the two transform/transpose passes
     55   for (pass = 0; pass < 2; ++pass) {
     56     /*canbe16*/ int input[4];
     57     /*canbe16*/ int step[4];
     58     /*needs32*/ int temp1, temp2;
     59     int i;
     60     for (i = 0; i < 4; ++i) {
     61       // Load inputs.
     62       if (0 == pass) {
     63         input[0] = in[0 * stride] * 16;
     64         input[1] = in[1 * stride] * 16;
     65         input[2] = in[2 * stride] * 16;
     66         input[3] = in[3 * stride] * 16;
     67         if (i == 0 && input[0]) {
     68           input[0] += 1;
     69         }
     70       } else {
     71         input[0] = in[0 * 4];
     72         input[1] = in[1 * 4];
     73         input[2] = in[2 * 4];
     74         input[3] = in[3 * 4];
     75       }
     76       // Transform.
     77       step[0] = input[0] + input[3];
     78       step[1] = input[1] + input[2];
     79       step[2] = input[1] - input[2];
     80       step[3] = input[0] - input[3];
     81       temp1 = (step[0] + step[1]) * cospi_16_64;
     82       temp2 = (step[0] - step[1]) * cospi_16_64;
     83       out[0] = dct_const_round_shift(temp1);
     84       out[2] = dct_const_round_shift(temp2);
     85       temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
     86       temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
     87       out[1] = dct_const_round_shift(temp1);
     88       out[3] = dct_const_round_shift(temp2);
     89       // Do next column (which is a transposed row in second/horizontal pass)
     90       in++;
     91       out += 4;
     92     }
     93     // Setup in/out for next pass.
     94     in = intermediate;
     95     out = output;
     96   }
     97 
     98   {
     99     int i, j;
    100     for (i = 0; i < 4; ++i) {
    101       for (j = 0; j < 4; ++j)
    102         output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
    103     }
    104   }
    105 }
    106 
    107 static void fadst4(const int16_t *input, int16_t *output) {
    108   int x0, x1, x2, x3;
    109   int s0, s1, s2, s3, s4, s5, s6, s7;
    110 
    111   x0 = input[0];
    112   x1 = input[1];
    113   x2 = input[2];
    114   x3 = input[3];
    115 
    116   if (!(x0 | x1 | x2 | x3)) {
    117     output[0] = output[1] = output[2] = output[3] = 0;
    118     return;
    119   }
    120 
    121   s0 = sinpi_1_9 * x0;
    122   s1 = sinpi_4_9 * x0;
    123   s2 = sinpi_2_9 * x1;
    124   s3 = sinpi_1_9 * x1;
    125   s4 = sinpi_3_9 * x2;
    126   s5 = sinpi_4_9 * x3;
    127   s6 = sinpi_2_9 * x3;
    128   s7 = x0 + x1 - x3;
    129 
    130   x0 = s0 + s2 + s5;
    131   x1 = sinpi_3_9 * s7;
    132   x2 = s1 - s3 + s6;
    133   x3 = s4;
    134 
    135   s0 = x0 + x3;
    136   s1 = x1;
    137   s2 = x2 - x3;
    138   s3 = x2 - x0 + x3;
    139 
    140   // 1-D transform scaling factor is sqrt(2).
    141   output[0] = dct_const_round_shift(s0);
    142   output[1] = dct_const_round_shift(s1);
    143   output[2] = dct_const_round_shift(s2);
    144   output[3] = dct_const_round_shift(s3);
    145 }
    146 
    147 static const transform_2d FHT_4[] = {
    148   { fdct4,  fdct4  },  // DCT_DCT  = 0
    149   { fadst4, fdct4  },  // ADST_DCT = 1
    150   { fdct4,  fadst4 },  // DCT_ADST = 2
    151   { fadst4, fadst4 }   // ADST_ADST = 3
    152 };
    153 
    154 void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,
    155                         int stride, int tx_type) {
    156   int16_t out[4 * 4];
    157   int16_t *outptr = &out[0];
    158   int i, j;
    159   int16_t temp_in[4], temp_out[4];
    160   const transform_2d ht = FHT_4[tx_type];
    161 
    162   // Columns
    163   for (i = 0; i < 4; ++i) {
    164     for (j = 0; j < 4; ++j)
    165       temp_in[j] = input[j * stride + i] * 16;
    166     if (i == 0 && temp_in[0])
    167       temp_in[0] += 1;
    168     ht.cols(temp_in, temp_out);
    169     for (j = 0; j < 4; ++j)
    170       outptr[j * 4 + i] = temp_out[j];
    171   }
    172 
    173   // Rows
    174   for (i = 0; i < 4; ++i) {
    175     for (j = 0; j < 4; ++j)
    176       temp_in[j] = out[j + i * 4];
    177     ht.rows(temp_in, temp_out);
    178     for (j = 0; j < 4; ++j)
    179       output[j + i * 4] = (temp_out[j] + 1) >> 2;
    180   }
    181 }
    182 
    183 static void fdct8(const int16_t *input, int16_t *output) {
    184   /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
    185   /*needs32*/ int t0, t1, t2, t3;
    186   /*canbe16*/ int x0, x1, x2, x3;
    187 
    188   // stage 1
    189   s0 = input[0] + input[7];
    190   s1 = input[1] + input[6];
    191   s2 = input[2] + input[5];
    192   s3 = input[3] + input[4];
    193   s4 = input[3] - input[4];
    194   s5 = input[2] - input[5];
    195   s6 = input[1] - input[6];
    196   s7 = input[0] - input[7];
    197 
    198   // fdct4(step, step);
    199   x0 = s0 + s3;
    200   x1 = s1 + s2;
    201   x2 = s1 - s2;
    202   x3 = s0 - s3;
    203   t0 = (x0 + x1) * cospi_16_64;
    204   t1 = (x0 - x1) * cospi_16_64;
    205   t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
    206   t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
    207   output[0] = dct_const_round_shift(t0);
    208   output[2] = dct_const_round_shift(t2);
    209   output[4] = dct_const_round_shift(t1);
    210   output[6] = dct_const_round_shift(t3);
    211 
    212   // Stage 2
    213   t0 = (s6 - s5) * cospi_16_64;
    214   t1 = (s6 + s5) * cospi_16_64;
    215   t2 = dct_const_round_shift(t0);
    216   t3 = dct_const_round_shift(t1);
    217 
    218   // Stage 3
    219   x0 = s4 + t2;
    220   x1 = s4 - t2;
    221   x2 = s7 - t3;
    222   x3 = s7 + t3;
    223 
    224   // Stage 4
    225   t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
    226   t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
    227   t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    228   t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
    229   output[1] = dct_const_round_shift(t0);
    230   output[3] = dct_const_round_shift(t2);
    231   output[5] = dct_const_round_shift(t1);
    232   output[7] = dct_const_round_shift(t3);
    233 }
    234 
    235 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
    236   int i, j;
    237   int16_t intermediate[64];
    238 
    239   // Transform columns
    240   {
    241     int16_t *output = intermediate;
    242     /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
    243     /*needs32*/ int t0, t1, t2, t3;
    244     /*canbe16*/ int x0, x1, x2, x3;
    245 
    246     int i;
    247     for (i = 0; i < 8; i++) {
    248       // stage 1
    249       s0 = (input[0 * stride] + input[7 * stride]) * 4;
    250       s1 = (input[1 * stride] + input[6 * stride]) * 4;
    251       s2 = (input[2 * stride] + input[5 * stride]) * 4;
    252       s3 = (input[3 * stride] + input[4 * stride]) * 4;
    253       s4 = (input[3 * stride] - input[4 * stride]) * 4;
    254       s5 = (input[2 * stride] - input[5 * stride]) * 4;
    255       s6 = (input[1 * stride] - input[6 * stride]) * 4;
    256       s7 = (input[0 * stride] - input[7 * stride]) * 4;
    257 
    258       // fdct4(step, step);
    259       x0 = s0 + s3;
    260       x1 = s1 + s2;
    261       x2 = s1 - s2;
    262       x3 = s0 - s3;
    263       t0 = (x0 + x1) * cospi_16_64;
    264       t1 = (x0 - x1) * cospi_16_64;
    265       t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
    266       t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
    267       output[0 * 8] = dct_const_round_shift(t0);
    268       output[2 * 8] = dct_const_round_shift(t2);
    269       output[4 * 8] = dct_const_round_shift(t1);
    270       output[6 * 8] = dct_const_round_shift(t3);
    271 
    272       // Stage 2
    273       t0 = (s6 - s5) * cospi_16_64;
    274       t1 = (s6 + s5) * cospi_16_64;
    275       t2 = dct_const_round_shift(t0);
    276       t3 = dct_const_round_shift(t1);
    277 
    278       // Stage 3
    279       x0 = s4 + t2;
    280       x1 = s4 - t2;
    281       x2 = s7 - t3;
    282       x3 = s7 + t3;
    283 
    284       // Stage 4
    285       t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
    286       t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
    287       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    288       t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
    289       output[1 * 8] = dct_const_round_shift(t0);
    290       output[3 * 8] = dct_const_round_shift(t2);
    291       output[5 * 8] = dct_const_round_shift(t1);
    292       output[7 * 8] = dct_const_round_shift(t3);
    293       input++;
    294       output++;
    295     }
    296   }
    297 
    298   // Rows
    299   for (i = 0; i < 8; ++i) {
    300     fdct8(&intermediate[i * 8], &final_output[i * 8]);
    301     for (j = 0; j < 8; ++j)
    302       final_output[j + i * 8] /= 2;
    303   }
    304 }
    305 
    306 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
    307   // The 2D transform is done with two passes which are actually pretty
    308   // similar. In the first one, we transform the columns and transpose
    309   // the results. In the second one, we transform the rows. To achieve that,
    310   // as the first pass results are transposed, we tranpose the columns (that
    311   // is the transposed rows) and transpose the results (so that it goes back
    312   // in normal/row positions).
    313   int pass;
    314   // We need an intermediate buffer between passes.
    315   int16_t intermediate[256];
    316   const int16_t *in = input;
    317   int16_t *out = intermediate;
    318   // Do the two transform/transpose passes
    319   for (pass = 0; pass < 2; ++pass) {
    320     /*canbe16*/ int step1[8];
    321     /*canbe16*/ int step2[8];
    322     /*canbe16*/ int step3[8];
    323     /*canbe16*/ int input[8];
    324     /*needs32*/ int temp1, temp2;
    325     int i;
    326     for (i = 0; i < 16; i++) {
    327       if (0 == pass) {
    328         // Calculate input for the first 8 results.
    329         input[0] = (in[0 * stride] + in[15 * stride]) * 4;
    330         input[1] = (in[1 * stride] + in[14 * stride]) * 4;
    331         input[2] = (in[2 * stride] + in[13 * stride]) * 4;
    332         input[3] = (in[3 * stride] + in[12 * stride]) * 4;
    333         input[4] = (in[4 * stride] + in[11 * stride]) * 4;
    334         input[5] = (in[5 * stride] + in[10 * stride]) * 4;
    335         input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
    336         input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
    337         // Calculate input for the next 8 results.
    338         step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
    339         step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
    340         step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
    341         step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
    342         step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
    343         step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
    344         step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
    345         step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
    346       } else {
    347         // Calculate input for the first 8 results.
    348         input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
    349         input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
    350         input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
    351         input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
    352         input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
    353         input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
    354         input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
    355         input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
    356         // Calculate input for the next 8 results.
    357         step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
    358         step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
    359         step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
    360         step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
    361         step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
    362         step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
    363         step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
    364         step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
    365       }
    366       // Work on the first eight values; fdct8(input, even_results);
    367       {
    368         /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
    369         /*needs32*/ int t0, t1, t2, t3;
    370         /*canbe16*/ int x0, x1, x2, x3;
    371 
    372         // stage 1
    373         s0 = input[0] + input[7];
    374         s1 = input[1] + input[6];
    375         s2 = input[2] + input[5];
    376         s3 = input[3] + input[4];
    377         s4 = input[3] - input[4];
    378         s5 = input[2] - input[5];
    379         s6 = input[1] - input[6];
    380         s7 = input[0] - input[7];
    381 
    382         // fdct4(step, step);
    383         x0 = s0 + s3;
    384         x1 = s1 + s2;
    385         x2 = s1 - s2;
    386         x3 = s0 - s3;
    387         t0 = (x0 + x1) * cospi_16_64;
    388         t1 = (x0 - x1) * cospi_16_64;
    389         t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
    390         t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
    391         out[0] = dct_const_round_shift(t0);
    392         out[4] = dct_const_round_shift(t2);
    393         out[8] = dct_const_round_shift(t1);
    394         out[12] = dct_const_round_shift(t3);
    395 
    396         // Stage 2
    397         t0 = (s6 - s5) * cospi_16_64;
    398         t1 = (s6 + s5) * cospi_16_64;
    399         t2 = dct_const_round_shift(t0);
    400         t3 = dct_const_round_shift(t1);
    401 
    402         // Stage 3
    403         x0 = s4 + t2;
    404         x1 = s4 - t2;
    405         x2 = s7 - t3;
    406         x3 = s7 + t3;
    407 
    408         // Stage 4
    409         t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
    410         t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
    411         t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    412         t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
    413         out[2] = dct_const_round_shift(t0);
    414         out[6] = dct_const_round_shift(t2);
    415         out[10] = dct_const_round_shift(t1);
    416         out[14] = dct_const_round_shift(t3);
    417       }
    418       // Work on the next eight values; step1 -> odd_results
    419       {
    420         // step 2
    421         temp1 = (step1[5] - step1[2]) * cospi_16_64;
    422         temp2 = (step1[4] - step1[3]) * cospi_16_64;
    423         step2[2] = dct_const_round_shift(temp1);
    424         step2[3] = dct_const_round_shift(temp2);
    425         temp1 = (step1[4] + step1[3]) * cospi_16_64;
    426         temp2 = (step1[5] + step1[2]) * cospi_16_64;
    427         step2[4] = dct_const_round_shift(temp1);
    428         step2[5] = dct_const_round_shift(temp2);
    429         // step 3
    430         step3[0] = step1[0] + step2[3];
    431         step3[1] = step1[1] + step2[2];
    432         step3[2] = step1[1] - step2[2];
    433         step3[3] = step1[0] - step2[3];
    434         step3[4] = step1[7] - step2[4];
    435         step3[5] = step1[6] - step2[5];
    436         step3[6] = step1[6] + step2[5];
    437         step3[7] = step1[7] + step2[4];
    438         // step 4
    439         temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
    440         temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
    441         step2[1] = dct_const_round_shift(temp1);
    442         step2[2] = dct_const_round_shift(temp2);
    443         temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
    444         temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
    445         step2[5] = dct_const_round_shift(temp1);
    446         step2[6] = dct_const_round_shift(temp2);
    447         // step 5
    448         step1[0] = step3[0] + step2[1];
    449         step1[1] = step3[0] - step2[1];
    450         step1[2] = step3[3] - step2[2];
    451         step1[3] = step3[3] + step2[2];
    452         step1[4] = step3[4] + step2[5];
    453         step1[5] = step3[4] - step2[5];
    454         step1[6] = step3[7] - step2[6];
    455         step1[7] = step3[7] + step2[6];
    456         // step 6
    457         temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
    458         temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
    459         out[1] = dct_const_round_shift(temp1);
    460         out[9] = dct_const_round_shift(temp2);
    461         temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
    462         temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
    463         out[5] = dct_const_round_shift(temp1);
    464         out[13] = dct_const_round_shift(temp2);
    465         temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
    466         temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
    467         out[3] = dct_const_round_shift(temp1);
    468         out[11] = dct_const_round_shift(temp2);
    469         temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
    470         temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
    471         out[7] = dct_const_round_shift(temp1);
    472         out[15] = dct_const_round_shift(temp2);
    473       }
    474       // Do next column (which is a transposed row in second/horizontal pass)
    475       in++;
    476       out += 16;
    477     }
    478     // Setup in/out for next pass.
    479     in = intermediate;
    480     out = output;
    481   }
    482 }
    483 
    484 static void fadst8(const int16_t *input, int16_t *output) {
    485   int s0, s1, s2, s3, s4, s5, s6, s7;
    486 
    487   int x0 = input[7];
    488   int x1 = input[0];
    489   int x2 = input[5];
    490   int x3 = input[2];
    491   int x4 = input[3];
    492   int x5 = input[4];
    493   int x6 = input[1];
    494   int x7 = input[6];
    495 
    496   // stage 1
    497   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
    498   s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
    499   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
    500   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
    501   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
    502   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
    503   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
    504   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
    505 
    506   x0 = dct_const_round_shift(s0 + s4);
    507   x1 = dct_const_round_shift(s1 + s5);
    508   x2 = dct_const_round_shift(s2 + s6);
    509   x3 = dct_const_round_shift(s3 + s7);
    510   x4 = dct_const_round_shift(s0 - s4);
    511   x5 = dct_const_round_shift(s1 - s5);
    512   x6 = dct_const_round_shift(s2 - s6);
    513   x7 = dct_const_round_shift(s3 - s7);
    514 
    515   // stage 2
    516   s0 = x0;
    517   s1 = x1;
    518   s2 = x2;
    519   s3 = x3;
    520   s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
    521   s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
    522   s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
    523   s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
    524 
    525   x0 = s0 + s2;
    526   x1 = s1 + s3;
    527   x2 = s0 - s2;
    528   x3 = s1 - s3;
    529   x4 = dct_const_round_shift(s4 + s6);
    530   x5 = dct_const_round_shift(s5 + s7);
    531   x6 = dct_const_round_shift(s4 - s6);
    532   x7 = dct_const_round_shift(s5 - s7);
    533 
    534   // stage 3
    535   s2 = cospi_16_64 * (x2 + x3);
    536   s3 = cospi_16_64 * (x2 - x3);
    537   s6 = cospi_16_64 * (x6 + x7);
    538   s7 = cospi_16_64 * (x6 - x7);
    539 
    540   x2 = dct_const_round_shift(s2);
    541   x3 = dct_const_round_shift(s3);
    542   x6 = dct_const_round_shift(s6);
    543   x7 = dct_const_round_shift(s7);
    544 
    545   output[0] =   x0;
    546   output[1] = - x4;
    547   output[2] =   x6;
    548   output[3] = - x2;
    549   output[4] =   x3;
    550   output[5] = - x7;
    551   output[6] =   x5;
    552   output[7] = - x1;
    553 }
    554 
    555 static const transform_2d FHT_8[] = {
    556   { fdct8,  fdct8  },  // DCT_DCT  = 0
    557   { fadst8, fdct8  },  // ADST_DCT = 1
    558   { fdct8,  fadst8 },  // DCT_ADST = 2
    559   { fadst8, fadst8 }   // ADST_ADST = 3
    560 };
    561 
    562 void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,
    563                         int stride, int tx_type) {
    564   int16_t out[64];
    565   int16_t *outptr = &out[0];
    566   int i, j;
    567   int16_t temp_in[8], temp_out[8];
    568   const transform_2d ht = FHT_8[tx_type];
    569 
    570   // Columns
    571   for (i = 0; i < 8; ++i) {
    572     for (j = 0; j < 8; ++j)
    573       temp_in[j] = input[j * stride + i] * 4;
    574     ht.cols(temp_in, temp_out);
    575     for (j = 0; j < 8; ++j)
    576       outptr[j * 8 + i] = temp_out[j];
    577   }
    578 
    579   // Rows
    580   for (i = 0; i < 8; ++i) {
    581     for (j = 0; j < 8; ++j)
    582       temp_in[j] = out[j + i * 8];
    583     ht.rows(temp_in, temp_out);
    584     for (j = 0; j < 8; ++j)
    585       output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
    586   }
    587 }
    588 
    589 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
    590    pixel. */
    591 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
    592   int i;
    593   int a1, b1, c1, d1, e1;
    594   const int16_t *ip = input;
    595   int16_t *op = output;
    596 
    597   for (i = 0; i < 4; i++) {
    598     a1 = ip[0 * stride];
    599     b1 = ip[1 * stride];
    600     c1 = ip[2 * stride];
    601     d1 = ip[3 * stride];
    602 
    603     a1 += b1;
    604     d1 = d1 - c1;
    605     e1 = (a1 - d1) >> 1;
    606     b1 = e1 - b1;
    607     c1 = e1 - c1;
    608     a1 -= c1;
    609     d1 += b1;
    610     op[0] = a1;
    611     op[4] = c1;
    612     op[8] = d1;
    613     op[12] = b1;
    614 
    615     ip++;
    616     op++;
    617   }
    618   ip = output;
    619   op = output;
    620 
    621   for (i = 0; i < 4; i++) {
    622     a1 = ip[0];
    623     b1 = ip[1];
    624     c1 = ip[2];
    625     d1 = ip[3];
    626 
    627     a1 += b1;
    628     d1 -= c1;
    629     e1 = (a1 - d1) >> 1;
    630     b1 = e1 - b1;
    631     c1 = e1 - c1;
    632     a1 -= c1;
    633     d1 += b1;
    634     op[0] = a1 * UNIT_QUANT_FACTOR;
    635     op[1] = c1 * UNIT_QUANT_FACTOR;
    636     op[2] = d1 * UNIT_QUANT_FACTOR;
    637     op[3] = b1 * UNIT_QUANT_FACTOR;
    638 
    639     ip += 4;
    640     op += 4;
    641   }
    642 }
    643 
    644 // Rewrote to use same algorithm as others.
    645 static void fdct16(const int16_t in[16], int16_t out[16]) {
    646   /*canbe16*/ int step1[8];
    647   /*canbe16*/ int step2[8];
    648   /*canbe16*/ int step3[8];
    649   /*canbe16*/ int input[8];
    650   /*needs32*/ int temp1, temp2;
    651 
    652   // step 1
    653   input[0] = in[0] + in[15];
    654   input[1] = in[1] + in[14];
    655   input[2] = in[2] + in[13];
    656   input[3] = in[3] + in[12];
    657   input[4] = in[4] + in[11];
    658   input[5] = in[5] + in[10];
    659   input[6] = in[6] + in[ 9];
    660   input[7] = in[7] + in[ 8];
    661 
    662   step1[0] = in[7] - in[ 8];
    663   step1[1] = in[6] - in[ 9];
    664   step1[2] = in[5] - in[10];
    665   step1[3] = in[4] - in[11];
    666   step1[4] = in[3] - in[12];
    667   step1[5] = in[2] - in[13];
    668   step1[6] = in[1] - in[14];
    669   step1[7] = in[0] - in[15];
    670 
    671   // fdct8(step, step);
    672   {
    673     /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
    674     /*needs32*/ int t0, t1, t2, t3;
    675     /*canbe16*/ int x0, x1, x2, x3;
    676 
    677     // stage 1
    678     s0 = input[0] + input[7];
    679     s1 = input[1] + input[6];
    680     s2 = input[2] + input[5];
    681     s3 = input[3] + input[4];
    682     s4 = input[3] - input[4];
    683     s5 = input[2] - input[5];
    684     s6 = input[1] - input[6];
    685     s7 = input[0] - input[7];
    686 
    687     // fdct4(step, step);
    688     x0 = s0 + s3;
    689     x1 = s1 + s2;
    690     x2 = s1 - s2;
    691     x3 = s0 - s3;
    692     t0 = (x0 + x1) * cospi_16_64;
    693     t1 = (x0 - x1) * cospi_16_64;
    694     t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
    695     t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
    696     out[0] = dct_const_round_shift(t0);
    697     out[4] = dct_const_round_shift(t2);
    698     out[8] = dct_const_round_shift(t1);
    699     out[12] = dct_const_round_shift(t3);
    700 
    701     // Stage 2
    702     t0 = (s6 - s5) * cospi_16_64;
    703     t1 = (s6 + s5) * cospi_16_64;
    704     t2 = dct_const_round_shift(t0);
    705     t3 = dct_const_round_shift(t1);
    706 
    707     // Stage 3
    708     x0 = s4 + t2;
    709     x1 = s4 - t2;
    710     x2 = s7 - t3;
    711     x3 = s7 + t3;
    712 
    713     // Stage 4
    714     t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
    715     t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
    716     t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    717     t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
    718     out[2] = dct_const_round_shift(t0);
    719     out[6] = dct_const_round_shift(t2);
    720     out[10] = dct_const_round_shift(t1);
    721     out[14] = dct_const_round_shift(t3);
    722   }
    723 
    724   // step 2
    725   temp1 = (step1[5] - step1[2]) * cospi_16_64;
    726   temp2 = (step1[4] - step1[3]) * cospi_16_64;
    727   step2[2] = dct_const_round_shift(temp1);
    728   step2[3] = dct_const_round_shift(temp2);
    729   temp1 = (step1[4] + step1[3]) * cospi_16_64;
    730   temp2 = (step1[5] + step1[2]) * cospi_16_64;
    731   step2[4] = dct_const_round_shift(temp1);
    732   step2[5] = dct_const_round_shift(temp2);
    733 
    734   // step 3
    735   step3[0] = step1[0] + step2[3];
    736   step3[1] = step1[1] + step2[2];
    737   step3[2] = step1[1] - step2[2];
    738   step3[3] = step1[0] - step2[3];
    739   step3[4] = step1[7] - step2[4];
    740   step3[5] = step1[6] - step2[5];
    741   step3[6] = step1[6] + step2[5];
    742   step3[7] = step1[7] + step2[4];
    743 
    744   // step 4
    745   temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
    746   temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
    747   step2[1] = dct_const_round_shift(temp1);
    748   step2[2] = dct_const_round_shift(temp2);
    749   temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
    750   temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
    751   step2[5] = dct_const_round_shift(temp1);
    752   step2[6] = dct_const_round_shift(temp2);
    753 
    754   // step 5
    755   step1[0] = step3[0] + step2[1];
    756   step1[1] = step3[0] - step2[1];
    757   step1[2] = step3[3] - step2[2];
    758   step1[3] = step3[3] + step2[2];
    759   step1[4] = step3[4] + step2[5];
    760   step1[5] = step3[4] - step2[5];
    761   step1[6] = step3[7] - step2[6];
    762   step1[7] = step3[7] + step2[6];
    763 
    764   // step 6
    765   temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
    766   temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
    767   out[1] = dct_const_round_shift(temp1);
    768   out[9] = dct_const_round_shift(temp2);
    769 
    770   temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
    771   temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
    772   out[5] = dct_const_round_shift(temp1);
    773   out[13] = dct_const_round_shift(temp2);
    774 
    775   temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
    776   temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
    777   out[3] = dct_const_round_shift(temp1);
    778   out[11] = dct_const_round_shift(temp2);
    779 
    780   temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
    781   temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
    782   out[7] = dct_const_round_shift(temp1);
    783   out[15] = dct_const_round_shift(temp2);
    784 }
    785 
    786 static void fadst16(const int16_t *input, int16_t *output) {
    787   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
    788 
    789   int x0 = input[15];
    790   int x1 = input[0];
    791   int x2 = input[13];
    792   int x3 = input[2];
    793   int x4 = input[11];
    794   int x5 = input[4];
    795   int x6 = input[9];
    796   int x7 = input[6];
    797   int x8 = input[7];
    798   int x9 = input[8];
    799   int x10 = input[5];
    800   int x11 = input[10];
    801   int x12 = input[3];
    802   int x13 = input[12];
    803   int x14 = input[1];
    804   int x15 = input[14];
    805 
    806   // stage 1
    807   s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
    808   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
    809   s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
    810   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
    811   s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
    812   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
    813   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
    814   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
    815   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
    816   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
    817   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
    818   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
    819   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
    820   s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
    821   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
    822   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
    823 
    824   x0 = dct_const_round_shift(s0 + s8);
    825   x1 = dct_const_round_shift(s1 + s9);
    826   x2 = dct_const_round_shift(s2 + s10);
    827   x3 = dct_const_round_shift(s3 + s11);
    828   x4 = dct_const_round_shift(s4 + s12);
    829   x5 = dct_const_round_shift(s5 + s13);
    830   x6 = dct_const_round_shift(s6 + s14);
    831   x7 = dct_const_round_shift(s7 + s15);
    832   x8  = dct_const_round_shift(s0 - s8);
    833   x9  = dct_const_round_shift(s1 - s9);
    834   x10 = dct_const_round_shift(s2 - s10);
    835   x11 = dct_const_round_shift(s3 - s11);
    836   x12 = dct_const_round_shift(s4 - s12);
    837   x13 = dct_const_round_shift(s5 - s13);
    838   x14 = dct_const_round_shift(s6 - s14);
    839   x15 = dct_const_round_shift(s7 - s15);
    840 
    841   // stage 2
    842   s0 = x0;
    843   s1 = x1;
    844   s2 = x2;
    845   s3 = x3;
    846   s4 = x4;
    847   s5 = x5;
    848   s6 = x6;
    849   s7 = x7;
    850   s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
    851   s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
    852   s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
    853   s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
    854   s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
    855   s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
    856   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
    857   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
    858 
    859   x0 = s0 + s4;
    860   x1 = s1 + s5;
    861   x2 = s2 + s6;
    862   x3 = s3 + s7;
    863   x4 = s0 - s4;
    864   x5 = s1 - s5;
    865   x6 = s2 - s6;
    866   x7 = s3 - s7;
    867   x8 = dct_const_round_shift(s8 + s12);
    868   x9 = dct_const_round_shift(s9 + s13);
    869   x10 = dct_const_round_shift(s10 + s14);
    870   x11 = dct_const_round_shift(s11 + s15);
    871   x12 = dct_const_round_shift(s8 - s12);
    872   x13 = dct_const_round_shift(s9 - s13);
    873   x14 = dct_const_round_shift(s10 - s14);
    874   x15 = dct_const_round_shift(s11 - s15);
    875 
    876   // stage 3
    877   s0 = x0;
    878   s1 = x1;
    879   s2 = x2;
    880   s3 = x3;
    881   s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
    882   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
    883   s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
    884   s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
    885   s8 = x8;
    886   s9 = x9;
    887   s10 = x10;
    888   s11 = x11;
    889   s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
    890   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
    891   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
    892   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
    893 
    894   x0 = s0 + s2;
    895   x1 = s1 + s3;
    896   x2 = s0 - s2;
    897   x3 = s1 - s3;
    898   x4 = dct_const_round_shift(s4 + s6);
    899   x5 = dct_const_round_shift(s5 + s7);
    900   x6 = dct_const_round_shift(s4 - s6);
    901   x7 = dct_const_round_shift(s5 - s7);
    902   x8 = s8 + s10;
    903   x9 = s9 + s11;
    904   x10 = s8 - s10;
    905   x11 = s9 - s11;
    906   x12 = dct_const_round_shift(s12 + s14);
    907   x13 = dct_const_round_shift(s13 + s15);
    908   x14 = dct_const_round_shift(s12 - s14);
    909   x15 = dct_const_round_shift(s13 - s15);
    910 
    911   // stage 4
    912   s2 = (- cospi_16_64) * (x2 + x3);
    913   s3 = cospi_16_64 * (x2 - x3);
    914   s6 = cospi_16_64 * (x6 + x7);
    915   s7 = cospi_16_64 * (- x6 + x7);
    916   s10 = cospi_16_64 * (x10 + x11);
    917   s11 = cospi_16_64 * (- x10 + x11);
    918   s14 = (- cospi_16_64) * (x14 + x15);
    919   s15 = cospi_16_64 * (x14 - x15);
    920 
    921   x2 = dct_const_round_shift(s2);
    922   x3 = dct_const_round_shift(s3);
    923   x6 = dct_const_round_shift(s6);
    924   x7 = dct_const_round_shift(s7);
    925   x10 = dct_const_round_shift(s10);
    926   x11 = dct_const_round_shift(s11);
    927   x14 = dct_const_round_shift(s14);
    928   x15 = dct_const_round_shift(s15);
    929 
    930   output[0] = x0;
    931   output[1] = - x8;
    932   output[2] = x12;
    933   output[3] = - x4;
    934   output[4] = x6;
    935   output[5] = x14;
    936   output[6] = x10;
    937   output[7] = x2;
    938   output[8] = x3;
    939   output[9] =  x11;
    940   output[10] = x15;
    941   output[11] = x7;
    942   output[12] = x5;
    943   output[13] = - x13;
    944   output[14] = x9;
    945   output[15] = - x1;
    946 }
    947 
    948 static const transform_2d FHT_16[] = {
    949   { fdct16,  fdct16  },  // DCT_DCT  = 0
    950   { fadst16, fdct16  },  // ADST_DCT = 1
    951   { fdct16,  fadst16 },  // DCT_ADST = 2
    952   { fadst16, fadst16 }   // ADST_ADST = 3
    953 };
    954 
    955 void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,
    956                           int stride, int tx_type) {
    957   int16_t out[256];
    958   int16_t *outptr = &out[0];
    959   int i, j;
    960   int16_t temp_in[16], temp_out[16];
    961   const transform_2d ht = FHT_16[tx_type];
    962 
    963   // Columns
    964   for (i = 0; i < 16; ++i) {
    965     for (j = 0; j < 16; ++j)
    966       temp_in[j] = input[j * stride + i] * 4;
    967     ht.cols(temp_in, temp_out);
    968     for (j = 0; j < 16; ++j)
    969       outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
    970 //      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
    971   }
    972 
    973   // Rows
    974   for (i = 0; i < 16; ++i) {
    975     for (j = 0; j < 16; ++j)
    976       temp_in[j] = out[j + i * 16];
    977     ht.rows(temp_in, temp_out);
    978     for (j = 0; j < 16; ++j)
    979       output[j + i * 16] = temp_out[j];
    980   }
    981 }
    982 
    983 static INLINE int dct_32_round(int input) {
    984   int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
    985   assert(-131072 <= rv && rv <= 131071);
    986   return rv;
    987 }
    988 
    989 static INLINE int half_round_shift(int input) {
    990   int rv = (input + 1 + (input < 0)) >> 2;
    991   return rv;
    992 }
    993 
    994 static void dct32_1d(const int *input, int *output, int round) {
    995   int step[32];
    996   // Stage 1
    997   step[0] = input[0] + input[(32 - 1)];
    998   step[1] = input[1] + input[(32 - 2)];
    999   step[2] = input[2] + input[(32 - 3)];
   1000   step[3] = input[3] + input[(32 - 4)];
   1001   step[4] = input[4] + input[(32 - 5)];
   1002   step[5] = input[5] + input[(32 - 6)];
   1003   step[6] = input[6] + input[(32 - 7)];
   1004   step[7] = input[7] + input[(32 - 8)];
   1005   step[8] = input[8] + input[(32 - 9)];
   1006   step[9] = input[9] + input[(32 - 10)];
   1007   step[10] = input[10] + input[(32 - 11)];
   1008   step[11] = input[11] + input[(32 - 12)];
   1009   step[12] = input[12] + input[(32 - 13)];
   1010   step[13] = input[13] + input[(32 - 14)];
   1011   step[14] = input[14] + input[(32 - 15)];
   1012   step[15] = input[15] + input[(32 - 16)];
   1013   step[16] = -input[16] + input[(32 - 17)];
   1014   step[17] = -input[17] + input[(32 - 18)];
   1015   step[18] = -input[18] + input[(32 - 19)];
   1016   step[19] = -input[19] + input[(32 - 20)];
   1017   step[20] = -input[20] + input[(32 - 21)];
   1018   step[21] = -input[21] + input[(32 - 22)];
   1019   step[22] = -input[22] + input[(32 - 23)];
   1020   step[23] = -input[23] + input[(32 - 24)];
   1021   step[24] = -input[24] + input[(32 - 25)];
   1022   step[25] = -input[25] + input[(32 - 26)];
   1023   step[26] = -input[26] + input[(32 - 27)];
   1024   step[27] = -input[27] + input[(32 - 28)];
   1025   step[28] = -input[28] + input[(32 - 29)];
   1026   step[29] = -input[29] + input[(32 - 30)];
   1027   step[30] = -input[30] + input[(32 - 31)];
   1028   step[31] = -input[31] + input[(32 - 32)];
   1029 
   1030   // Stage 2
   1031   output[0] = step[0] + step[16 - 1];
   1032   output[1] = step[1] + step[16 - 2];
   1033   output[2] = step[2] + step[16 - 3];
   1034   output[3] = step[3] + step[16 - 4];
   1035   output[4] = step[4] + step[16 - 5];
   1036   output[5] = step[5] + step[16 - 6];
   1037   output[6] = step[6] + step[16 - 7];
   1038   output[7] = step[7] + step[16 - 8];
   1039   output[8] = -step[8] + step[16 - 9];
   1040   output[9] = -step[9] + step[16 - 10];
   1041   output[10] = -step[10] + step[16 - 11];
   1042   output[11] = -step[11] + step[16 - 12];
   1043   output[12] = -step[12] + step[16 - 13];
   1044   output[13] = -step[13] + step[16 - 14];
   1045   output[14] = -step[14] + step[16 - 15];
   1046   output[15] = -step[15] + step[16 - 16];
   1047 
   1048   output[16] = step[16];
   1049   output[17] = step[17];
   1050   output[18] = step[18];
   1051   output[19] = step[19];
   1052 
   1053   output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
   1054   output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
   1055   output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
   1056   output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
   1057 
   1058   output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
   1059   output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
   1060   output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
   1061   output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
   1062 
   1063   output[28] = step[28];
   1064   output[29] = step[29];
   1065   output[30] = step[30];
   1066   output[31] = step[31];
   1067 
   1068   // dump the magnitude by 4, hence the intermediate values are within
   1069   // the range of 16 bits.
   1070   if (round) {
   1071     output[0] = half_round_shift(output[0]);
   1072     output[1] = half_round_shift(output[1]);
   1073     output[2] = half_round_shift(output[2]);
   1074     output[3] = half_round_shift(output[3]);
   1075     output[4] = half_round_shift(output[4]);
   1076     output[5] = half_round_shift(output[5]);
   1077     output[6] = half_round_shift(output[6]);
   1078     output[7] = half_round_shift(output[7]);
   1079     output[8] = half_round_shift(output[8]);
   1080     output[9] = half_round_shift(output[9]);
   1081     output[10] = half_round_shift(output[10]);
   1082     output[11] = half_round_shift(output[11]);
   1083     output[12] = half_round_shift(output[12]);
   1084     output[13] = half_round_shift(output[13]);
   1085     output[14] = half_round_shift(output[14]);
   1086     output[15] = half_round_shift(output[15]);
   1087 
   1088     output[16] = half_round_shift(output[16]);
   1089     output[17] = half_round_shift(output[17]);
   1090     output[18] = half_round_shift(output[18]);
   1091     output[19] = half_round_shift(output[19]);
   1092     output[20] = half_round_shift(output[20]);
   1093     output[21] = half_round_shift(output[21]);
   1094     output[22] = half_round_shift(output[22]);
   1095     output[23] = half_round_shift(output[23]);
   1096     output[24] = half_round_shift(output[24]);
   1097     output[25] = half_round_shift(output[25]);
   1098     output[26] = half_round_shift(output[26]);
   1099     output[27] = half_round_shift(output[27]);
   1100     output[28] = half_round_shift(output[28]);
   1101     output[29] = half_round_shift(output[29]);
   1102     output[30] = half_round_shift(output[30]);
   1103     output[31] = half_round_shift(output[31]);
   1104   }
   1105 
   1106   // Stage 3
   1107   step[0] = output[0] + output[(8 - 1)];
   1108   step[1] = output[1] + output[(8 - 2)];
   1109   step[2] = output[2] + output[(8 - 3)];
   1110   step[3] = output[3] + output[(8 - 4)];
   1111   step[4] = -output[4] + output[(8 - 5)];
   1112   step[5] = -output[5] + output[(8 - 6)];
   1113   step[6] = -output[6] + output[(8 - 7)];
   1114   step[7] = -output[7] + output[(8 - 8)];
   1115   step[8] = output[8];
   1116   step[9] = output[9];
   1117   step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
   1118   step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
   1119   step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
   1120   step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
   1121   step[14] = output[14];
   1122   step[15] = output[15];
   1123 
   1124   step[16] = output[16] + output[23];
   1125   step[17] = output[17] + output[22];
   1126   step[18] = output[18] + output[21];
   1127   step[19] = output[19] + output[20];
   1128   step[20] = -output[20] + output[19];
   1129   step[21] = -output[21] + output[18];
   1130   step[22] = -output[22] + output[17];
   1131   step[23] = -output[23] + output[16];
   1132   step[24] = -output[24] + output[31];
   1133   step[25] = -output[25] + output[30];
   1134   step[26] = -output[26] + output[29];
   1135   step[27] = -output[27] + output[28];
   1136   step[28] = output[28] + output[27];
   1137   step[29] = output[29] + output[26];
   1138   step[30] = output[30] + output[25];
   1139   step[31] = output[31] + output[24];
   1140 
   1141   // Stage 4
   1142   output[0] = step[0] + step[3];
   1143   output[1] = step[1] + step[2];
   1144   output[2] = -step[2] + step[1];
   1145   output[3] = -step[3] + step[0];
   1146   output[4] = step[4];
   1147   output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
   1148   output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
   1149   output[7] = step[7];
   1150   output[8] = step[8] + step[11];
   1151   output[9] = step[9] + step[10];
   1152   output[10] = -step[10] + step[9];
   1153   output[11] = -step[11] + step[8];
   1154   output[12] = -step[12] + step[15];
   1155   output[13] = -step[13] + step[14];
   1156   output[14] = step[14] + step[13];
   1157   output[15] = step[15] + step[12];
   1158 
   1159   output[16] = step[16];
   1160   output[17] = step[17];
   1161   output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
   1162   output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
   1163   output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
   1164   output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
   1165   output[22] = step[22];
   1166   output[23] = step[23];
   1167   output[24] = step[24];
   1168   output[25] = step[25];
   1169   output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
   1170   output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
   1171   output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
   1172   output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
   1173   output[30] = step[30];
   1174   output[31] = step[31];
   1175 
   1176   // Stage 5
   1177   step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
   1178   step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
   1179   step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
   1180   step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
   1181   step[4] = output[4] + output[5];
   1182   step[5] = -output[5] + output[4];
   1183   step[6] = -output[6] + output[7];
   1184   step[7] = output[7] + output[6];
   1185   step[8] = output[8];
   1186   step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
   1187   step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
   1188   step[11] = output[11];
   1189   step[12] = output[12];
   1190   step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
   1191   step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
   1192   step[15] = output[15];
   1193 
   1194   step[16] = output[16] + output[19];
   1195   step[17] = output[17] + output[18];
   1196   step[18] = -output[18] + output[17];
   1197   step[19] = -output[19] + output[16];
   1198   step[20] = -output[20] + output[23];
   1199   step[21] = -output[21] + output[22];
   1200   step[22] = output[22] + output[21];
   1201   step[23] = output[23] + output[20];
   1202   step[24] = output[24] + output[27];
   1203   step[25] = output[25] + output[26];
   1204   step[26] = -output[26] + output[25];
   1205   step[27] = -output[27] + output[24];
   1206   step[28] = -output[28] + output[31];
   1207   step[29] = -output[29] + output[30];
   1208   step[30] = output[30] + output[29];
   1209   step[31] = output[31] + output[28];
   1210 
   1211   // Stage 6
   1212   output[0] = step[0];
   1213   output[1] = step[1];
   1214   output[2] = step[2];
   1215   output[3] = step[3];
   1216   output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
   1217   output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
   1218   output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
   1219   output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
   1220   output[8] = step[8] + step[9];
   1221   output[9] = -step[9] + step[8];
   1222   output[10] = -step[10] + step[11];
   1223   output[11] = step[11] + step[10];
   1224   output[12] = step[12] + step[13];
   1225   output[13] = -step[13] + step[12];
   1226   output[14] = -step[14] + step[15];
   1227   output[15] = step[15] + step[14];
   1228 
   1229   output[16] = step[16];
   1230   output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
   1231   output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
   1232   output[19] = step[19];
   1233   output[20] = step[20];
   1234   output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
   1235   output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
   1236   output[23] = step[23];
   1237   output[24] = step[24];
   1238   output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
   1239   output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
   1240   output[27] = step[27];
   1241   output[28] = step[28];
   1242   output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
   1243   output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
   1244   output[31] = step[31];
   1245 
   1246   // Stage 7
   1247   step[0] = output[0];
   1248   step[1] = output[1];
   1249   step[2] = output[2];
   1250   step[3] = output[3];
   1251   step[4] = output[4];
   1252   step[5] = output[5];
   1253   step[6] = output[6];
   1254   step[7] = output[7];
   1255   step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
   1256   step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
   1257   step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
   1258   step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
   1259   step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
   1260   step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
   1261   step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
   1262   step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
   1263 
   1264   step[16] = output[16] + output[17];
   1265   step[17] = -output[17] + output[16];
   1266   step[18] = -output[18] + output[19];
   1267   step[19] = output[19] + output[18];
   1268   step[20] = output[20] + output[21];
   1269   step[21] = -output[21] + output[20];
   1270   step[22] = -output[22] + output[23];
   1271   step[23] = output[23] + output[22];
   1272   step[24] = output[24] + output[25];
   1273   step[25] = -output[25] + output[24];
   1274   step[26] = -output[26] + output[27];
   1275   step[27] = output[27] + output[26];
   1276   step[28] = output[28] + output[29];
   1277   step[29] = -output[29] + output[28];
   1278   step[30] = -output[30] + output[31];
   1279   step[31] = output[31] + output[30];
   1280 
   1281   // Final stage --- outputs indices are bit-reversed.
   1282   output[0]  = step[0];
   1283   output[16] = step[1];
   1284   output[8]  = step[2];
   1285   output[24] = step[3];
   1286   output[4]  = step[4];
   1287   output[20] = step[5];
   1288   output[12] = step[6];
   1289   output[28] = step[7];
   1290   output[2]  = step[8];
   1291   output[18] = step[9];
   1292   output[10] = step[10];
   1293   output[26] = step[11];
   1294   output[6]  = step[12];
   1295   output[22] = step[13];
   1296   output[14] = step[14];
   1297   output[30] = step[15];
   1298 
   1299   output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
   1300   output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
   1301   output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
   1302   output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
   1303   output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
   1304   output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
   1305   output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
   1306   output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
   1307   output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
   1308   output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
   1309   output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
   1310   output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
   1311   output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
   1312   output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
   1313   output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
   1314   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
   1315 }
   1316 
   1317 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
   1318   int i, j;
   1319   int output[32 * 32];
   1320 
   1321   // Columns
   1322   for (i = 0; i < 32; ++i) {
   1323     int temp_in[32], temp_out[32];
   1324     for (j = 0; j < 32; ++j)
   1325       temp_in[j] = input[j * stride + i] * 4;
   1326     dct32_1d(temp_in, temp_out, 0);
   1327     for (j = 0; j < 32; ++j)
   1328       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   1329   }
   1330 
   1331   // Rows
   1332   for (i = 0; i < 32; ++i) {
   1333     int temp_in[32], temp_out[32];
   1334     for (j = 0; j < 32; ++j)
   1335       temp_in[j] = output[j + i * 32];
   1336     dct32_1d(temp_in, temp_out, 0);
   1337     for (j = 0; j < 32; ++j)
   1338       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   1339   }
   1340 }
   1341 
   1342 // Note that although we use dct_32_round in dct32_1d computation flow,
   1343 // this 2d fdct32x32 for rate-distortion optimization loop is operating
   1344 // within 16 bits precision.
   1345 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
   1346   int i, j;
   1347   int output[32 * 32];
   1348 
   1349   // Columns
   1350   for (i = 0; i < 32; ++i) {
   1351     int temp_in[32], temp_out[32];
   1352     for (j = 0; j < 32; ++j)
   1353       temp_in[j] = input[j * stride + i] * 4;
   1354     dct32_1d(temp_in, temp_out, 0);
   1355     for (j = 0; j < 32; ++j)
   1356       // TODO(cd): see quality impact of only doing
   1357       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
   1358       //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
   1359       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   1360   }
   1361 
   1362   // Rows
   1363   for (i = 0; i < 32; ++i) {
   1364     int temp_in[32], temp_out[32];
   1365     for (j = 0; j < 32; ++j)
   1366       temp_in[j] = output[j + i * 32];
   1367     dct32_1d(temp_in, temp_out, 1);
   1368     for (j = 0; j < 32; ++j)
   1369       out[j + i * 32] = temp_out[j];
   1370   }
   1371 }
   1372 
   1373 void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
   1374                 int stride) {
   1375   if (tx_type == DCT_DCT)
   1376     vp9_fdct4x4(input, output, stride);
   1377   else
   1378     vp9_short_fht4x4(input, output, stride, tx_type);
   1379 }
   1380 
   1381 void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
   1382                 int stride) {
   1383   if (tx_type == DCT_DCT)
   1384     vp9_fdct8x8(input, output, stride);
   1385   else
   1386     vp9_short_fht8x8(input, output, stride, tx_type);
   1387 }
   1388 
   1389 void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
   1390                   int stride) {
   1391   if (tx_type == DCT_DCT)
   1392     vp9_fdct16x16(input, output, stride);
   1393   else
   1394     vp9_short_fht16x16(input, output, stride, tx_type);
   1395 }
   1396