Home | History | Annotate | Download | only in encoder
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <math.h>
     13 
     14 #include "./vpx_config.h"
     15 #include "./vp9_rtcd.h"
     16 
     17 #include "vp9/common/vp9_blockd.h"
     18 #include "vp9/common/vp9_idct.h"
     19 #include "vp9/common/vp9_systemdependent.h"
     20 
     21 static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
     22   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
     23   // TODO(debargha, peter.derivaz): Find new bounds for this assert
     24   // and make the bounds consts.
     25   // assert(INT16_MIN <= rv && rv <= INT16_MAX);
     26   return rv;
     27 }
     28 
     29 static void fdct4(const tran_low_t *input, tran_low_t *output) {
     30   tran_high_t step[4];
     31   tran_high_t temp1, temp2;
     32 
     33   step[0] = input[0] + input[3];
     34   step[1] = input[1] + input[2];
     35   step[2] = input[1] - input[2];
     36   step[3] = input[0] - input[3];
     37 
     38   temp1 = (step[0] + step[1]) * cospi_16_64;
     39   temp2 = (step[0] - step[1]) * cospi_16_64;
     40   output[0] = fdct_round_shift(temp1);
     41   output[2] = fdct_round_shift(temp2);
     42   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
     43   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
     44   output[1] = fdct_round_shift(temp1);
     45   output[3] = fdct_round_shift(temp2);
     46 }
     47 
     48 void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
     49   int r, c;
     50   tran_low_t sum = 0;
     51   for (r = 0; r < 4; ++r)
     52     for (c = 0; c < 4; ++c)
     53       sum += input[r * stride + c];
     54 
     55   output[0] = sum << 1;
     56   output[1] = 0;
     57 }
     58 
     59 void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
     60   // The 2D transform is done with two passes which are actually pretty
     61   // similar. In the first one, we transform the columns and transpose
     62   // the results. In the second one, we transform the rows. To achieve that,
     63   // as the first pass results are transposed, we transpose the columns (that
     64   // is the transposed rows) and transpose the results (so that it goes back
     65   // in normal/row positions).
     66   int pass;
     67   // We need an intermediate buffer between passes.
     68   tran_low_t intermediate[4 * 4];
     69   const int16_t *in_pass0 = input;
     70   const tran_low_t *in = NULL;
     71   tran_low_t *out = intermediate;
     72   // Do the two transform/transpose passes
     73   for (pass = 0; pass < 2; ++pass) {
     74     tran_high_t input[4];      // canbe16
     75     tran_high_t step[4];       // canbe16
     76     tran_high_t temp1, temp2;  // needs32
     77     int i;
     78     for (i = 0; i < 4; ++i) {
     79       // Load inputs.
     80       if (0 == pass) {
     81         input[0] = in_pass0[0 * stride] * 16;
     82         input[1] = in_pass0[1 * stride] * 16;
     83         input[2] = in_pass0[2 * stride] * 16;
     84         input[3] = in_pass0[3 * stride] * 16;
     85         if (i == 0 && input[0]) {
     86           input[0] += 1;
     87         }
     88       } else {
     89         input[0] = in[0 * 4];
     90         input[1] = in[1 * 4];
     91         input[2] = in[2 * 4];
     92         input[3] = in[3 * 4];
     93       }
     94       // Transform.
     95       step[0] = input[0] + input[3];
     96       step[1] = input[1] + input[2];
     97       step[2] = input[1] - input[2];
     98       step[3] = input[0] - input[3];
     99       temp1 = (step[0] + step[1]) * cospi_16_64;
    100       temp2 = (step[0] - step[1]) * cospi_16_64;
    101       out[0] = fdct_round_shift(temp1);
    102       out[2] = fdct_round_shift(temp2);
    103       temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
    104       temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
    105       out[1] = fdct_round_shift(temp1);
    106       out[3] = fdct_round_shift(temp2);
    107       // Do next column (which is a transposed row in second/horizontal pass)
    108       in_pass0++;
    109       in++;
    110       out += 4;
    111     }
    112     // Setup in/out for next pass.
    113     in = intermediate;
    114     out = output;
    115   }
    116 
    117   {
    118     int i, j;
    119     for (i = 0; i < 4; ++i) {
    120       for (j = 0; j < 4; ++j)
    121         output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
    122     }
    123   }
    124 }
    125 
    126 static void fadst4(const tran_low_t *input, tran_low_t *output) {
    127   tran_high_t x0, x1, x2, x3;
    128   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
    129 
    130   x0 = input[0];
    131   x1 = input[1];
    132   x2 = input[2];
    133   x3 = input[3];
    134 
    135   if (!(x0 | x1 | x2 | x3)) {
    136     output[0] = output[1] = output[2] = output[3] = 0;
    137     return;
    138   }
    139 
    140   s0 = sinpi_1_9 * x0;
    141   s1 = sinpi_4_9 * x0;
    142   s2 = sinpi_2_9 * x1;
    143   s3 = sinpi_1_9 * x1;
    144   s4 = sinpi_3_9 * x2;
    145   s5 = sinpi_4_9 * x3;
    146   s6 = sinpi_2_9 * x3;
    147   s7 = x0 + x1 - x3;
    148 
    149   x0 = s0 + s2 + s5;
    150   x1 = sinpi_3_9 * s7;
    151   x2 = s1 - s3 + s6;
    152   x3 = s4;
    153 
    154   s0 = x0 + x3;
    155   s1 = x1;
    156   s2 = x2 - x3;
    157   s3 = x2 - x0 + x3;
    158 
    159   // 1-D transform scaling factor is sqrt(2).
    160   output[0] = fdct_round_shift(s0);
    161   output[1] = fdct_round_shift(s1);
    162   output[2] = fdct_round_shift(s2);
    163   output[3] = fdct_round_shift(s3);
    164 }
    165 
    166 static const transform_2d FHT_4[] = {
    167   { fdct4,  fdct4  },  // DCT_DCT  = 0
    168   { fadst4, fdct4  },  // ADST_DCT = 1
    169   { fdct4,  fadst4 },  // DCT_ADST = 2
    170   { fadst4, fadst4 }   // ADST_ADST = 3
    171 };
    172 
    173 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
    174                   int stride, int tx_type) {
    175   if (tx_type == DCT_DCT) {
    176     vp9_fdct4x4_c(input, output, stride);
    177   } else {
    178     tran_low_t out[4 * 4];
    179     tran_low_t *outptr = &out[0];
    180     int i, j;
    181     tran_low_t temp_in[4], temp_out[4];
    182     const transform_2d ht = FHT_4[tx_type];
    183 
    184     // Columns
    185     for (i = 0; i < 4; ++i) {
    186       for (j = 0; j < 4; ++j)
    187         temp_in[j] = input[j * stride + i] * 16;
    188       if (i == 0 && temp_in[0])
    189         temp_in[0] += 1;
    190       ht.cols(temp_in, temp_out);
    191       for (j = 0; j < 4; ++j)
    192         outptr[j * 4 + i] = temp_out[j];
    193     }
    194 
    195     // Rows
    196     for (i = 0; i < 4; ++i) {
    197       for (j = 0; j < 4; ++j)
    198         temp_in[j] = out[j + i * 4];
    199       ht.rows(temp_in, temp_out);
    200       for (j = 0; j < 4; ++j)
    201         output[j + i * 4] = (temp_out[j] + 1) >> 2;
    202     }
    203   }
    204 }
    205 
    206 static void fdct8(const tran_low_t *input, tran_low_t *output) {
    207   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    208   tran_high_t t0, t1, t2, t3;                  // needs32
    209   tran_high_t x0, x1, x2, x3;                  // canbe16
    210 
    211   // stage 1
    212   s0 = input[0] + input[7];
    213   s1 = input[1] + input[6];
    214   s2 = input[2] + input[5];
    215   s3 = input[3] + input[4];
    216   s4 = input[3] - input[4];
    217   s5 = input[2] - input[5];
    218   s6 = input[1] - input[6];
    219   s7 = input[0] - input[7];
    220 
    221   // fdct4(step, step);
    222   x0 = s0 + s3;
    223   x1 = s1 + s2;
    224   x2 = s1 - s2;
    225   x3 = s0 - s3;
    226   t0 = (x0 + x1) * cospi_16_64;
    227   t1 = (x0 - x1) * cospi_16_64;
    228   t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
    229   t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
    230   output[0] = fdct_round_shift(t0);
    231   output[2] = fdct_round_shift(t2);
    232   output[4] = fdct_round_shift(t1);
    233   output[6] = fdct_round_shift(t3);
    234 
    235   // Stage 2
    236   t0 = (s6 - s5) * cospi_16_64;
    237   t1 = (s6 + s5) * cospi_16_64;
    238   t2 = fdct_round_shift(t0);
    239   t3 = fdct_round_shift(t1);
    240 
    241   // Stage 3
    242   x0 = s4 + t2;
    243   x1 = s4 - t2;
    244   x2 = s7 - t3;
    245   x3 = s7 + t3;
    246 
    247   // Stage 4
    248   t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
    249   t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
    250   t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    251   t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
    252   output[1] = fdct_round_shift(t0);
    253   output[3] = fdct_round_shift(t2);
    254   output[5] = fdct_round_shift(t1);
    255   output[7] = fdct_round_shift(t3);
    256 }
    257 
    258 void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
    259   int r, c;
    260   tran_low_t sum = 0;
    261   for (r = 0; r < 8; ++r)
    262     for (c = 0; c < 8; ++c)
    263       sum += input[r * stride + c];
    264 
    265   output[0] = sum;
    266   output[1] = 0;
    267 }
    268 
    269 void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
    270   int i, j;
    271   tran_low_t intermediate[64];
    272 
    273   // Transform columns
    274   {
    275     tran_low_t *output = intermediate;
    276     tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    277     tran_high_t t0, t1, t2, t3;                  // needs32
    278     tran_high_t x0, x1, x2, x3;                  // canbe16
    279 
    280     int i;
    281     for (i = 0; i < 8; i++) {
    282       // stage 1
    283       s0 = (input[0 * stride] + input[7 * stride]) * 4;
    284       s1 = (input[1 * stride] + input[6 * stride]) * 4;
    285       s2 = (input[2 * stride] + input[5 * stride]) * 4;
    286       s3 = (input[3 * stride] + input[4 * stride]) * 4;
    287       s4 = (input[3 * stride] - input[4 * stride]) * 4;
    288       s5 = (input[2 * stride] - input[5 * stride]) * 4;
    289       s6 = (input[1 * stride] - input[6 * stride]) * 4;
    290       s7 = (input[0 * stride] - input[7 * stride]) * 4;
    291 
    292       // fdct4(step, step);
    293       x0 = s0 + s3;
    294       x1 = s1 + s2;
    295       x2 = s1 - s2;
    296       x3 = s0 - s3;
    297       t0 = (x0 + x1) * cospi_16_64;
    298       t1 = (x0 - x1) * cospi_16_64;
    299       t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
    300       t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
    301       output[0 * 8] = fdct_round_shift(t0);
    302       output[2 * 8] = fdct_round_shift(t2);
    303       output[4 * 8] = fdct_round_shift(t1);
    304       output[6 * 8] = fdct_round_shift(t3);
    305 
    306       // Stage 2
    307       t0 = (s6 - s5) * cospi_16_64;
    308       t1 = (s6 + s5) * cospi_16_64;
    309       t2 = fdct_round_shift(t0);
    310       t3 = fdct_round_shift(t1);
    311 
    312       // Stage 3
    313       x0 = s4 + t2;
    314       x1 = s4 - t2;
    315       x2 = s7 - t3;
    316       x3 = s7 + t3;
    317 
    318       // Stage 4
    319       t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
    320       t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
    321       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    322       t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
    323       output[1 * 8] = fdct_round_shift(t0);
    324       output[3 * 8] = fdct_round_shift(t2);
    325       output[5 * 8] = fdct_round_shift(t1);
    326       output[7 * 8] = fdct_round_shift(t3);
    327       input++;
    328       output++;
    329     }
    330   }
    331 
    332   // Rows
    333   for (i = 0; i < 8; ++i) {
    334     fdct8(&intermediate[i * 8], &final_output[i * 8]);
    335     for (j = 0; j < 8; ++j)
    336       final_output[j + i * 8] /= 2;
    337   }
    338 }
    339 
    340 void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
    341   int r, c;
    342   tran_low_t sum = 0;
    343   for (r = 0; r < 16; ++r)
    344     for (c = 0; c < 16; ++c)
    345       sum += input[r * stride + c];
    346 
    347   output[0] = sum >> 1;
    348   output[1] = 0;
    349 }
    350 
    351 void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
    352   // The 2D transform is done with two passes which are actually pretty
    353   // similar. In the first one, we transform the columns and transpose
    354   // the results. In the second one, we transform the rows. To achieve that,
    355   // as the first pass results are transposed, we transpose the columns (that
    356   // is the transposed rows) and transpose the results (so that it goes back
    357   // in normal/row positions).
    358   int pass;
    359   // We need an intermediate buffer between passes.
    360   tran_low_t intermediate[256];
    361   const int16_t *in_pass0 = input;
    362   const tran_low_t *in = NULL;
    363   tran_low_t *out = intermediate;
    364   // Do the two transform/transpose passes
    365   for (pass = 0; pass < 2; ++pass) {
    366     tran_high_t step1[8];      // canbe16
    367     tran_high_t step2[8];      // canbe16
    368     tran_high_t step3[8];      // canbe16
    369     tran_high_t input[8];      // canbe16
    370     tran_high_t temp1, temp2;  // needs32
    371     int i;
    372     for (i = 0; i < 16; i++) {
    373       if (0 == pass) {
    374         // Calculate input for the first 8 results.
    375         input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
    376         input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
    377         input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
    378         input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
    379         input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
    380         input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
    381         input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
    382         input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
    383         // Calculate input for the next 8 results.
    384         step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
    385         step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
    386         step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
    387         step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
    388         step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
    389         step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
    390         step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
    391         step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
    392       } else {
    393         // Calculate input for the first 8 results.
    394         input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
    395         input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
    396         input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
    397         input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
    398         input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
    399         input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
    400         input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
    401         input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
    402         // Calculate input for the next 8 results.
    403         step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
    404         step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
    405         step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
    406         step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
    407         step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
    408         step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
    409         step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
    410         step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
    411       }
    412       // Work on the first eight values; fdct8(input, even_results);
    413       {
    414         tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    415         tran_high_t t0, t1, t2, t3;                  // needs32
    416         tran_high_t x0, x1, x2, x3;                  // canbe16
    417 
    418         // stage 1
    419         s0 = input[0] + input[7];
    420         s1 = input[1] + input[6];
    421         s2 = input[2] + input[5];
    422         s3 = input[3] + input[4];
    423         s4 = input[3] - input[4];
    424         s5 = input[2] - input[5];
    425         s6 = input[1] - input[6];
    426         s7 = input[0] - input[7];
    427 
    428         // fdct4(step, step);
    429         x0 = s0 + s3;
    430         x1 = s1 + s2;
    431         x2 = s1 - s2;
    432         x3 = s0 - s3;
    433         t0 = (x0 + x1) * cospi_16_64;
    434         t1 = (x0 - x1) * cospi_16_64;
    435         t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
    436         t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
    437         out[0] = fdct_round_shift(t0);
    438         out[4] = fdct_round_shift(t2);
    439         out[8] = fdct_round_shift(t1);
    440         out[12] = fdct_round_shift(t3);
    441 
    442         // Stage 2
    443         t0 = (s6 - s5) * cospi_16_64;
    444         t1 = (s6 + s5) * cospi_16_64;
    445         t2 = fdct_round_shift(t0);
    446         t3 = fdct_round_shift(t1);
    447 
    448         // Stage 3
    449         x0 = s4 + t2;
    450         x1 = s4 - t2;
    451         x2 = s7 - t3;
    452         x3 = s7 + t3;
    453 
    454         // Stage 4
    455         t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
    456         t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
    457         t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    458         t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
    459         out[2] = fdct_round_shift(t0);
    460         out[6] = fdct_round_shift(t2);
    461         out[10] = fdct_round_shift(t1);
    462         out[14] = fdct_round_shift(t3);
    463       }
    464       // Work on the next eight values; step1 -> odd_results
    465       {
    466         // step 2
    467         temp1 = (step1[5] - step1[2]) * cospi_16_64;
    468         temp2 = (step1[4] - step1[3]) * cospi_16_64;
    469         step2[2] = fdct_round_shift(temp1);
    470         step2[3] = fdct_round_shift(temp2);
    471         temp1 = (step1[4] + step1[3]) * cospi_16_64;
    472         temp2 = (step1[5] + step1[2]) * cospi_16_64;
    473         step2[4] = fdct_round_shift(temp1);
    474         step2[5] = fdct_round_shift(temp2);
    475         // step 3
    476         step3[0] = step1[0] + step2[3];
    477         step3[1] = step1[1] + step2[2];
    478         step3[2] = step1[1] - step2[2];
    479         step3[3] = step1[0] - step2[3];
    480         step3[4] = step1[7] - step2[4];
    481         step3[5] = step1[6] - step2[5];
    482         step3[6] = step1[6] + step2[5];
    483         step3[7] = step1[7] + step2[4];
    484         // step 4
    485         temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
    486         temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
    487         step2[1] = fdct_round_shift(temp1);
    488         step2[2] = fdct_round_shift(temp2);
    489         temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
    490         temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
    491         step2[5] = fdct_round_shift(temp1);
    492         step2[6] = fdct_round_shift(temp2);
    493         // step 5
    494         step1[0] = step3[0] + step2[1];
    495         step1[1] = step3[0] - step2[1];
    496         step1[2] = step3[3] + step2[2];
    497         step1[3] = step3[3] - step2[2];
    498         step1[4] = step3[4] - step2[5];
    499         step1[5] = step3[4] + step2[5];
    500         step1[6] = step3[7] - step2[6];
    501         step1[7] = step3[7] + step2[6];
    502         // step 6
    503         temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
    504         temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
    505         out[1] = fdct_round_shift(temp1);
    506         out[9] = fdct_round_shift(temp2);
    507         temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
    508         temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
    509         out[5] = fdct_round_shift(temp1);
    510         out[13] = fdct_round_shift(temp2);
    511         temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
    512         temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
    513         out[3] = fdct_round_shift(temp1);
    514         out[11] = fdct_round_shift(temp2);
    515         temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
    516         temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
    517         out[7] = fdct_round_shift(temp1);
    518         out[15] = fdct_round_shift(temp2);
    519       }
    520       // Do next column (which is a transposed row in second/horizontal pass)
    521       in++;
    522       in_pass0++;
    523       out += 16;
    524     }
    525     // Setup in/out for next pass.
    526     in = intermediate;
    527     out = output;
    528   }
    529 }
    530 
    531 static void fadst8(const tran_low_t *input, tran_low_t *output) {
    532   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
    533 
    534   tran_high_t x0 = input[7];
    535   tran_high_t x1 = input[0];
    536   tran_high_t x2 = input[5];
    537   tran_high_t x3 = input[2];
    538   tran_high_t x4 = input[3];
    539   tran_high_t x5 = input[4];
    540   tran_high_t x6 = input[1];
    541   tran_high_t x7 = input[6];
    542 
    543   // stage 1
    544   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
    545   s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
    546   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
    547   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
    548   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
    549   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
    550   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
    551   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
    552 
    553   x0 = fdct_round_shift(s0 + s4);
    554   x1 = fdct_round_shift(s1 + s5);
    555   x2 = fdct_round_shift(s2 + s6);
    556   x3 = fdct_round_shift(s3 + s7);
    557   x4 = fdct_round_shift(s0 - s4);
    558   x5 = fdct_round_shift(s1 - s5);
    559   x6 = fdct_round_shift(s2 - s6);
    560   x7 = fdct_round_shift(s3 - s7);
    561 
    562   // stage 2
    563   s0 = x0;
    564   s1 = x1;
    565   s2 = x2;
    566   s3 = x3;
    567   s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
    568   s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
    569   s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
    570   s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
    571 
    572   x0 = s0 + s2;
    573   x1 = s1 + s3;
    574   x2 = s0 - s2;
    575   x3 = s1 - s3;
    576   x4 = fdct_round_shift(s4 + s6);
    577   x5 = fdct_round_shift(s5 + s7);
    578   x6 = fdct_round_shift(s4 - s6);
    579   x7 = fdct_round_shift(s5 - s7);
    580 
    581   // stage 3
    582   s2 = cospi_16_64 * (x2 + x3);
    583   s3 = cospi_16_64 * (x2 - x3);
    584   s6 = cospi_16_64 * (x6 + x7);
    585   s7 = cospi_16_64 * (x6 - x7);
    586 
    587   x2 = fdct_round_shift(s2);
    588   x3 = fdct_round_shift(s3);
    589   x6 = fdct_round_shift(s6);
    590   x7 = fdct_round_shift(s7);
    591 
    592   output[0] =   x0;
    593   output[1] = - x4;
    594   output[2] =   x6;
    595   output[3] = - x2;
    596   output[4] =   x3;
    597   output[5] = - x7;
    598   output[6] =   x5;
    599   output[7] = - x1;
    600 }
    601 
    602 static const transform_2d FHT_8[] = {
    603   { fdct8,  fdct8  },  // DCT_DCT  = 0
    604   { fadst8, fdct8  },  // ADST_DCT = 1
    605   { fdct8,  fadst8 },  // DCT_ADST = 2
    606   { fadst8, fadst8 }   // ADST_ADST = 3
    607 };
    608 
    609 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
    610                   int stride, int tx_type) {
    611   if (tx_type == DCT_DCT) {
    612     vp9_fdct8x8_c(input, output, stride);
    613   } else {
    614     tran_low_t out[64];
    615     tran_low_t *outptr = &out[0];
    616     int i, j;
    617     tran_low_t temp_in[8], temp_out[8];
    618     const transform_2d ht = FHT_8[tx_type];
    619 
    620     // Columns
    621     for (i = 0; i < 8; ++i) {
    622       for (j = 0; j < 8; ++j)
    623         temp_in[j] = input[j * stride + i] * 4;
    624       ht.cols(temp_in, temp_out);
    625       for (j = 0; j < 8; ++j)
    626         outptr[j * 8 + i] = temp_out[j];
    627     }
    628 
    629     // Rows
    630     for (i = 0; i < 8; ++i) {
    631       for (j = 0; j < 8; ++j)
    632         temp_in[j] = out[j + i * 8];
    633       ht.rows(temp_in, temp_out);
    634       for (j = 0; j < 8; ++j)
    635         output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
    636     }
    637   }
    638 }
    639 
    640 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
    641    pixel. */
    642 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
    643   int i;
    644   tran_high_t a1, b1, c1, d1, e1;
    645   const int16_t *ip_pass0 = input;
    646   const tran_low_t *ip = NULL;
    647   tran_low_t *op = output;
    648 
    649   for (i = 0; i < 4; i++) {
    650     a1 = ip_pass0[0 * stride];
    651     b1 = ip_pass0[1 * stride];
    652     c1 = ip_pass0[2 * stride];
    653     d1 = ip_pass0[3 * stride];
    654 
    655     a1 += b1;
    656     d1 = d1 - c1;
    657     e1 = (a1 - d1) >> 1;
    658     b1 = e1 - b1;
    659     c1 = e1 - c1;
    660     a1 -= c1;
    661     d1 += b1;
    662     op[0] = a1;
    663     op[4] = c1;
    664     op[8] = d1;
    665     op[12] = b1;
    666 
    667     ip_pass0++;
    668     op++;
    669   }
    670   ip = output;
    671   op = output;
    672 
    673   for (i = 0; i < 4; i++) {
    674     a1 = ip[0];
    675     b1 = ip[1];
    676     c1 = ip[2];
    677     d1 = ip[3];
    678 
    679     a1 += b1;
    680     d1 -= c1;
    681     e1 = (a1 - d1) >> 1;
    682     b1 = e1 - b1;
    683     c1 = e1 - c1;
    684     a1 -= c1;
    685     d1 += b1;
    686     op[0] = a1 * UNIT_QUANT_FACTOR;
    687     op[1] = c1 * UNIT_QUANT_FACTOR;
    688     op[2] = d1 * UNIT_QUANT_FACTOR;
    689     op[3] = b1 * UNIT_QUANT_FACTOR;
    690 
    691     ip += 4;
    692     op += 4;
    693   }
    694 }
    695 
    696 // Rewrote to use same algorithm as others.
    697 static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
    698   tran_high_t step1[8];      // canbe16
    699   tran_high_t step2[8];      // canbe16
    700   tran_high_t step3[8];      // canbe16
    701   tran_high_t input[8];      // canbe16
    702   tran_high_t temp1, temp2;  // needs32
    703 
    704   // step 1
    705   input[0] = in[0] + in[15];
    706   input[1] = in[1] + in[14];
    707   input[2] = in[2] + in[13];
    708   input[3] = in[3] + in[12];
    709   input[4] = in[4] + in[11];
    710   input[5] = in[5] + in[10];
    711   input[6] = in[6] + in[ 9];
    712   input[7] = in[7] + in[ 8];
    713 
    714   step1[0] = in[7] - in[ 8];
    715   step1[1] = in[6] - in[ 9];
    716   step1[2] = in[5] - in[10];
    717   step1[3] = in[4] - in[11];
    718   step1[4] = in[3] - in[12];
    719   step1[5] = in[2] - in[13];
    720   step1[6] = in[1] - in[14];
    721   step1[7] = in[0] - in[15];
    722 
    723   // fdct8(step, step);
    724   {
    725     tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    726     tran_high_t t0, t1, t2, t3;                  // needs32
    727     tran_high_t x0, x1, x2, x3;                  // canbe16
    728 
    729     // stage 1
    730     s0 = input[0] + input[7];
    731     s1 = input[1] + input[6];
    732     s2 = input[2] + input[5];
    733     s3 = input[3] + input[4];
    734     s4 = input[3] - input[4];
    735     s5 = input[2] - input[5];
    736     s6 = input[1] - input[6];
    737     s7 = input[0] - input[7];
    738 
    739     // fdct4(step, step);
    740     x0 = s0 + s3;
    741     x1 = s1 + s2;
    742     x2 = s1 - s2;
    743     x3 = s0 - s3;
    744     t0 = (x0 + x1) * cospi_16_64;
    745     t1 = (x0 - x1) * cospi_16_64;
    746     t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
    747     t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
    748     out[0] = fdct_round_shift(t0);
    749     out[4] = fdct_round_shift(t2);
    750     out[8] = fdct_round_shift(t1);
    751     out[12] = fdct_round_shift(t3);
    752 
    753     // Stage 2
    754     t0 = (s6 - s5) * cospi_16_64;
    755     t1 = (s6 + s5) * cospi_16_64;
    756     t2 = fdct_round_shift(t0);
    757     t3 = fdct_round_shift(t1);
    758 
    759     // Stage 3
    760     x0 = s4 + t2;
    761     x1 = s4 - t2;
    762     x2 = s7 - t3;
    763     x3 = s7 + t3;
    764 
    765     // Stage 4
    766     t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
    767     t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
    768     t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    769     t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
    770     out[2] = fdct_round_shift(t0);
    771     out[6] = fdct_round_shift(t2);
    772     out[10] = fdct_round_shift(t1);
    773     out[14] = fdct_round_shift(t3);
    774   }
    775 
    776   // step 2
    777   temp1 = (step1[5] - step1[2]) * cospi_16_64;
    778   temp2 = (step1[4] - step1[3]) * cospi_16_64;
    779   step2[2] = fdct_round_shift(temp1);
    780   step2[3] = fdct_round_shift(temp2);
    781   temp1 = (step1[4] + step1[3]) * cospi_16_64;
    782   temp2 = (step1[5] + step1[2]) * cospi_16_64;
    783   step2[4] = fdct_round_shift(temp1);
    784   step2[5] = fdct_round_shift(temp2);
    785 
    786   // step 3
    787   step3[0] = step1[0] + step2[3];
    788   step3[1] = step1[1] + step2[2];
    789   step3[2] = step1[1] - step2[2];
    790   step3[3] = step1[0] - step2[3];
    791   step3[4] = step1[7] - step2[4];
    792   step3[5] = step1[6] - step2[5];
    793   step3[6] = step1[6] + step2[5];
    794   step3[7] = step1[7] + step2[4];
    795 
    796   // step 4
    797   temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
    798   temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
    799   step2[1] = fdct_round_shift(temp1);
    800   step2[2] = fdct_round_shift(temp2);
    801   temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
    802   temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
    803   step2[5] = fdct_round_shift(temp1);
    804   step2[6] = fdct_round_shift(temp2);
    805 
    806   // step 5
    807   step1[0] = step3[0] + step2[1];
    808   step1[1] = step3[0] - step2[1];
    809   step1[2] = step3[3] + step2[2];
    810   step1[3] = step3[3] - step2[2];
    811   step1[4] = step3[4] - step2[5];
    812   step1[5] = step3[4] + step2[5];
    813   step1[6] = step3[7] - step2[6];
    814   step1[7] = step3[7] + step2[6];
    815 
    816   // step 6
    817   temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
    818   temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
    819   out[1] = fdct_round_shift(temp1);
    820   out[9] = fdct_round_shift(temp2);
    821 
    822   temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
    823   temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
    824   out[5] = fdct_round_shift(temp1);
    825   out[13] = fdct_round_shift(temp2);
    826 
    827   temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
    828   temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
    829   out[3] = fdct_round_shift(temp1);
    830   out[11] = fdct_round_shift(temp2);
    831 
    832   temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
    833   temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
    834   out[7] = fdct_round_shift(temp1);
    835   out[15] = fdct_round_shift(temp2);
    836 }
    837 
    838 static void fadst16(const tran_low_t *input, tran_low_t *output) {
    839   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
    840   tran_high_t s9, s10, s11, s12, s13, s14, s15;
    841 
    842   tran_high_t x0 = input[15];
    843   tran_high_t x1 = input[0];
    844   tran_high_t x2 = input[13];
    845   tran_high_t x3 = input[2];
    846   tran_high_t x4 = input[11];
    847   tran_high_t x5 = input[4];
    848   tran_high_t x6 = input[9];
    849   tran_high_t x7 = input[6];
    850   tran_high_t x8 = input[7];
    851   tran_high_t x9 = input[8];
    852   tran_high_t x10 = input[5];
    853   tran_high_t x11 = input[10];
    854   tran_high_t x12 = input[3];
    855   tran_high_t x13 = input[12];
    856   tran_high_t x14 = input[1];
    857   tran_high_t x15 = input[14];
    858 
    859   // stage 1
    860   s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
    861   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
    862   s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
    863   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
    864   s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
    865   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
    866   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
    867   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
    868   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
    869   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
    870   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
    871   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
    872   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
    873   s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
    874   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
    875   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
    876 
    877   x0 = fdct_round_shift(s0 + s8);
    878   x1 = fdct_round_shift(s1 + s9);
    879   x2 = fdct_round_shift(s2 + s10);
    880   x3 = fdct_round_shift(s3 + s11);
    881   x4 = fdct_round_shift(s4 + s12);
    882   x5 = fdct_round_shift(s5 + s13);
    883   x6 = fdct_round_shift(s6 + s14);
    884   x7 = fdct_round_shift(s7 + s15);
    885   x8  = fdct_round_shift(s0 - s8);
    886   x9  = fdct_round_shift(s1 - s9);
    887   x10 = fdct_round_shift(s2 - s10);
    888   x11 = fdct_round_shift(s3 - s11);
    889   x12 = fdct_round_shift(s4 - s12);
    890   x13 = fdct_round_shift(s5 - s13);
    891   x14 = fdct_round_shift(s6 - s14);
    892   x15 = fdct_round_shift(s7 - s15);
    893 
    894   // stage 2
    895   s0 = x0;
    896   s1 = x1;
    897   s2 = x2;
    898   s3 = x3;
    899   s4 = x4;
    900   s5 = x5;
    901   s6 = x6;
    902   s7 = x7;
    903   s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
    904   s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
    905   s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
    906   s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
    907   s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
    908   s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
    909   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
    910   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
    911 
    912   x0 = s0 + s4;
    913   x1 = s1 + s5;
    914   x2 = s2 + s6;
    915   x3 = s3 + s7;
    916   x4 = s0 - s4;
    917   x5 = s1 - s5;
    918   x6 = s2 - s6;
    919   x7 = s3 - s7;
    920   x8 = fdct_round_shift(s8 + s12);
    921   x9 = fdct_round_shift(s9 + s13);
    922   x10 = fdct_round_shift(s10 + s14);
    923   x11 = fdct_round_shift(s11 + s15);
    924   x12 = fdct_round_shift(s8 - s12);
    925   x13 = fdct_round_shift(s9 - s13);
    926   x14 = fdct_round_shift(s10 - s14);
    927   x15 = fdct_round_shift(s11 - s15);
    928 
    929   // stage 3
    930   s0 = x0;
    931   s1 = x1;
    932   s2 = x2;
    933   s3 = x3;
    934   s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
    935   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
    936   s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
    937   s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
    938   s8 = x8;
    939   s9 = x9;
    940   s10 = x10;
    941   s11 = x11;
    942   s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
    943   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
    944   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
    945   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
    946 
    947   x0 = s0 + s2;
    948   x1 = s1 + s3;
    949   x2 = s0 - s2;
    950   x3 = s1 - s3;
    951   x4 = fdct_round_shift(s4 + s6);
    952   x5 = fdct_round_shift(s5 + s7);
    953   x6 = fdct_round_shift(s4 - s6);
    954   x7 = fdct_round_shift(s5 - s7);
    955   x8 = s8 + s10;
    956   x9 = s9 + s11;
    957   x10 = s8 - s10;
    958   x11 = s9 - s11;
    959   x12 = fdct_round_shift(s12 + s14);
    960   x13 = fdct_round_shift(s13 + s15);
    961   x14 = fdct_round_shift(s12 - s14);
    962   x15 = fdct_round_shift(s13 - s15);
    963 
    964   // stage 4
    965   s2 = (- cospi_16_64) * (x2 + x3);
    966   s3 = cospi_16_64 * (x2 - x3);
    967   s6 = cospi_16_64 * (x6 + x7);
    968   s7 = cospi_16_64 * (- x6 + x7);
    969   s10 = cospi_16_64 * (x10 + x11);
    970   s11 = cospi_16_64 * (- x10 + x11);
    971   s14 = (- cospi_16_64) * (x14 + x15);
    972   s15 = cospi_16_64 * (x14 - x15);
    973 
    974   x2 = fdct_round_shift(s2);
    975   x3 = fdct_round_shift(s3);
    976   x6 = fdct_round_shift(s6);
    977   x7 = fdct_round_shift(s7);
    978   x10 = fdct_round_shift(s10);
    979   x11 = fdct_round_shift(s11);
    980   x14 = fdct_round_shift(s14);
    981   x15 = fdct_round_shift(s15);
    982 
    983   output[0] = x0;
    984   output[1] = - x8;
    985   output[2] = x12;
    986   output[3] = - x4;
    987   output[4] = x6;
    988   output[5] = x14;
    989   output[6] = x10;
    990   output[7] = x2;
    991   output[8] = x3;
    992   output[9] =  x11;
    993   output[10] = x15;
    994   output[11] = x7;
    995   output[12] = x5;
    996   output[13] = - x13;
    997   output[14] = x9;
    998   output[15] = - x1;
    999 }
   1000 
   1001 static const transform_2d FHT_16[] = {
   1002   { fdct16,  fdct16  },  // DCT_DCT  = 0
   1003   { fadst16, fdct16  },  // ADST_DCT = 1
   1004   { fdct16,  fadst16 },  // DCT_ADST = 2
   1005   { fadst16, fadst16 }   // ADST_ADST = 3
   1006 };
   1007 
   1008 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
   1009                     int stride, int tx_type) {
   1010   if (tx_type == DCT_DCT) {
   1011     vp9_fdct16x16_c(input, output, stride);
   1012   } else {
   1013     tran_low_t out[256];
   1014     tran_low_t *outptr = &out[0];
   1015     int i, j;
   1016     tran_low_t temp_in[16], temp_out[16];
   1017     const transform_2d ht = FHT_16[tx_type];
   1018 
   1019     // Columns
   1020     for (i = 0; i < 16; ++i) {
   1021       for (j = 0; j < 16; ++j)
   1022         temp_in[j] = input[j * stride + i] * 4;
   1023       ht.cols(temp_in, temp_out);
   1024       for (j = 0; j < 16; ++j)
   1025         outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   1026     }
   1027 
   1028     // Rows
   1029     for (i = 0; i < 16; ++i) {
   1030       for (j = 0; j < 16; ++j)
   1031         temp_in[j] = out[j + i * 16];
   1032       ht.rows(temp_in, temp_out);
   1033       for (j = 0; j < 16; ++j)
   1034         output[j + i * 16] = temp_out[j];
   1035     }
   1036   }
   1037 }
   1038 
   1039 static INLINE tran_high_t dct_32_round(tran_high_t input) {
   1040   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
   1041   // TODO(debargha, peter.derivaz): Find new bounds for this assert,
   1042   // and make the bounds consts.
   1043   // assert(-131072 <= rv && rv <= 131071);
   1044   return rv;
   1045 }
   1046 
   1047 static INLINE tran_high_t half_round_shift(tran_high_t input) {
   1048   tran_high_t rv = (input + 1 + (input < 0)) >> 2;
   1049   return rv;
   1050 }
   1051 
   1052 static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
   1053   tran_high_t step[32];
   1054   // Stage 1
   1055   step[0] = input[0] + input[(32 - 1)];
   1056   step[1] = input[1] + input[(32 - 2)];
   1057   step[2] = input[2] + input[(32 - 3)];
   1058   step[3] = input[3] + input[(32 - 4)];
   1059   step[4] = input[4] + input[(32 - 5)];
   1060   step[5] = input[5] + input[(32 - 6)];
   1061   step[6] = input[6] + input[(32 - 7)];
   1062   step[7] = input[7] + input[(32 - 8)];
   1063   step[8] = input[8] + input[(32 - 9)];
   1064   step[9] = input[9] + input[(32 - 10)];
   1065   step[10] = input[10] + input[(32 - 11)];
   1066   step[11] = input[11] + input[(32 - 12)];
   1067   step[12] = input[12] + input[(32 - 13)];
   1068   step[13] = input[13] + input[(32 - 14)];
   1069   step[14] = input[14] + input[(32 - 15)];
   1070   step[15] = input[15] + input[(32 - 16)];
   1071   step[16] = -input[16] + input[(32 - 17)];
   1072   step[17] = -input[17] + input[(32 - 18)];
   1073   step[18] = -input[18] + input[(32 - 19)];
   1074   step[19] = -input[19] + input[(32 - 20)];
   1075   step[20] = -input[20] + input[(32 - 21)];
   1076   step[21] = -input[21] + input[(32 - 22)];
   1077   step[22] = -input[22] + input[(32 - 23)];
   1078   step[23] = -input[23] + input[(32 - 24)];
   1079   step[24] = -input[24] + input[(32 - 25)];
   1080   step[25] = -input[25] + input[(32 - 26)];
   1081   step[26] = -input[26] + input[(32 - 27)];
   1082   step[27] = -input[27] + input[(32 - 28)];
   1083   step[28] = -input[28] + input[(32 - 29)];
   1084   step[29] = -input[29] + input[(32 - 30)];
   1085   step[30] = -input[30] + input[(32 - 31)];
   1086   step[31] = -input[31] + input[(32 - 32)];
   1087 
   1088   // Stage 2
   1089   output[0] = step[0] + step[16 - 1];
   1090   output[1] = step[1] + step[16 - 2];
   1091   output[2] = step[2] + step[16 - 3];
   1092   output[3] = step[3] + step[16 - 4];
   1093   output[4] = step[4] + step[16 - 5];
   1094   output[5] = step[5] + step[16 - 6];
   1095   output[6] = step[6] + step[16 - 7];
   1096   output[7] = step[7] + step[16 - 8];
   1097   output[8] = -step[8] + step[16 - 9];
   1098   output[9] = -step[9] + step[16 - 10];
   1099   output[10] = -step[10] + step[16 - 11];
   1100   output[11] = -step[11] + step[16 - 12];
   1101   output[12] = -step[12] + step[16 - 13];
   1102   output[13] = -step[13] + step[16 - 14];
   1103   output[14] = -step[14] + step[16 - 15];
   1104   output[15] = -step[15] + step[16 - 16];
   1105 
   1106   output[16] = step[16];
   1107   output[17] = step[17];
   1108   output[18] = step[18];
   1109   output[19] = step[19];
   1110 
   1111   output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
   1112   output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
   1113   output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
   1114   output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
   1115 
   1116   output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
   1117   output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
   1118   output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
   1119   output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
   1120 
   1121   output[28] = step[28];
   1122   output[29] = step[29];
   1123   output[30] = step[30];
   1124   output[31] = step[31];
   1125 
   1126   // dump the magnitude by 4, hence the intermediate values are within
   1127   // the range of 16 bits.
   1128   if (round) {
   1129     output[0] = half_round_shift(output[0]);
   1130     output[1] = half_round_shift(output[1]);
   1131     output[2] = half_round_shift(output[2]);
   1132     output[3] = half_round_shift(output[3]);
   1133     output[4] = half_round_shift(output[4]);
   1134     output[5] = half_round_shift(output[5]);
   1135     output[6] = half_round_shift(output[6]);
   1136     output[7] = half_round_shift(output[7]);
   1137     output[8] = half_round_shift(output[8]);
   1138     output[9] = half_round_shift(output[9]);
   1139     output[10] = half_round_shift(output[10]);
   1140     output[11] = half_round_shift(output[11]);
   1141     output[12] = half_round_shift(output[12]);
   1142     output[13] = half_round_shift(output[13]);
   1143     output[14] = half_round_shift(output[14]);
   1144     output[15] = half_round_shift(output[15]);
   1145 
   1146     output[16] = half_round_shift(output[16]);
   1147     output[17] = half_round_shift(output[17]);
   1148     output[18] = half_round_shift(output[18]);
   1149     output[19] = half_round_shift(output[19]);
   1150     output[20] = half_round_shift(output[20]);
   1151     output[21] = half_round_shift(output[21]);
   1152     output[22] = half_round_shift(output[22]);
   1153     output[23] = half_round_shift(output[23]);
   1154     output[24] = half_round_shift(output[24]);
   1155     output[25] = half_round_shift(output[25]);
   1156     output[26] = half_round_shift(output[26]);
   1157     output[27] = half_round_shift(output[27]);
   1158     output[28] = half_round_shift(output[28]);
   1159     output[29] = half_round_shift(output[29]);
   1160     output[30] = half_round_shift(output[30]);
   1161     output[31] = half_round_shift(output[31]);
   1162   }
   1163 
   1164   // Stage 3
   1165   step[0] = output[0] + output[(8 - 1)];
   1166   step[1] = output[1] + output[(8 - 2)];
   1167   step[2] = output[2] + output[(8 - 3)];
   1168   step[3] = output[3] + output[(8 - 4)];
   1169   step[4] = -output[4] + output[(8 - 5)];
   1170   step[5] = -output[5] + output[(8 - 6)];
   1171   step[6] = -output[6] + output[(8 - 7)];
   1172   step[7] = -output[7] + output[(8 - 8)];
   1173   step[8] = output[8];
   1174   step[9] = output[9];
   1175   step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
   1176   step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
   1177   step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
   1178   step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
   1179   step[14] = output[14];
   1180   step[15] = output[15];
   1181 
   1182   step[16] = output[16] + output[23];
   1183   step[17] = output[17] + output[22];
   1184   step[18] = output[18] + output[21];
   1185   step[19] = output[19] + output[20];
   1186   step[20] = -output[20] + output[19];
   1187   step[21] = -output[21] + output[18];
   1188   step[22] = -output[22] + output[17];
   1189   step[23] = -output[23] + output[16];
   1190   step[24] = -output[24] + output[31];
   1191   step[25] = -output[25] + output[30];
   1192   step[26] = -output[26] + output[29];
   1193   step[27] = -output[27] + output[28];
   1194   step[28] = output[28] + output[27];
   1195   step[29] = output[29] + output[26];
   1196   step[30] = output[30] + output[25];
   1197   step[31] = output[31] + output[24];
   1198 
   1199   // Stage 4
   1200   output[0] = step[0] + step[3];
   1201   output[1] = step[1] + step[2];
   1202   output[2] = -step[2] + step[1];
   1203   output[3] = -step[3] + step[0];
   1204   output[4] = step[4];
   1205   output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
   1206   output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
   1207   output[7] = step[7];
   1208   output[8] = step[8] + step[11];
   1209   output[9] = step[9] + step[10];
   1210   output[10] = -step[10] + step[9];
   1211   output[11] = -step[11] + step[8];
   1212   output[12] = -step[12] + step[15];
   1213   output[13] = -step[13] + step[14];
   1214   output[14] = step[14] + step[13];
   1215   output[15] = step[15] + step[12];
   1216 
   1217   output[16] = step[16];
   1218   output[17] = step[17];
   1219   output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
   1220   output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
   1221   output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
   1222   output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
   1223   output[22] = step[22];
   1224   output[23] = step[23];
   1225   output[24] = step[24];
   1226   output[25] = step[25];
   1227   output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
   1228   output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
   1229   output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
   1230   output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
   1231   output[30] = step[30];
   1232   output[31] = step[31];
   1233 
   1234   // Stage 5
   1235   step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
   1236   step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
   1237   step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
   1238   step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
   1239   step[4] = output[4] + output[5];
   1240   step[5] = -output[5] + output[4];
   1241   step[6] = -output[6] + output[7];
   1242   step[7] = output[7] + output[6];
   1243   step[8] = output[8];
   1244   step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
   1245   step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
   1246   step[11] = output[11];
   1247   step[12] = output[12];
   1248   step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
   1249   step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
   1250   step[15] = output[15];
   1251 
   1252   step[16] = output[16] + output[19];
   1253   step[17] = output[17] + output[18];
   1254   step[18] = -output[18] + output[17];
   1255   step[19] = -output[19] + output[16];
   1256   step[20] = -output[20] + output[23];
   1257   step[21] = -output[21] + output[22];
   1258   step[22] = output[22] + output[21];
   1259   step[23] = output[23] + output[20];
   1260   step[24] = output[24] + output[27];
   1261   step[25] = output[25] + output[26];
   1262   step[26] = -output[26] + output[25];
   1263   step[27] = -output[27] + output[24];
   1264   step[28] = -output[28] + output[31];
   1265   step[29] = -output[29] + output[30];
   1266   step[30] = output[30] + output[29];
   1267   step[31] = output[31] + output[28];
   1268 
   1269   // Stage 6
   1270   output[0] = step[0];
   1271   output[1] = step[1];
   1272   output[2] = step[2];
   1273   output[3] = step[3];
   1274   output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
   1275   output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
   1276   output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
   1277   output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
   1278   output[8] = step[8] + step[9];
   1279   output[9] = -step[9] + step[8];
   1280   output[10] = -step[10] + step[11];
   1281   output[11] = step[11] + step[10];
   1282   output[12] = step[12] + step[13];
   1283   output[13] = -step[13] + step[12];
   1284   output[14] = -step[14] + step[15];
   1285   output[15] = step[15] + step[14];
   1286 
   1287   output[16] = step[16];
   1288   output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
   1289   output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
   1290   output[19] = step[19];
   1291   output[20] = step[20];
   1292   output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
   1293   output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
   1294   output[23] = step[23];
   1295   output[24] = step[24];
   1296   output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
   1297   output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
   1298   output[27] = step[27];
   1299   output[28] = step[28];
   1300   output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
   1301   output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
   1302   output[31] = step[31];
   1303 
   1304   // Stage 7
   1305   step[0] = output[0];
   1306   step[1] = output[1];
   1307   step[2] = output[2];
   1308   step[3] = output[3];
   1309   step[4] = output[4];
   1310   step[5] = output[5];
   1311   step[6] = output[6];
   1312   step[7] = output[7];
   1313   step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
   1314   step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
   1315   step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
   1316   step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
   1317   step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
   1318   step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
   1319   step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
   1320   step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
   1321 
   1322   step[16] = output[16] + output[17];
   1323   step[17] = -output[17] + output[16];
   1324   step[18] = -output[18] + output[19];
   1325   step[19] = output[19] + output[18];
   1326   step[20] = output[20] + output[21];
   1327   step[21] = -output[21] + output[20];
   1328   step[22] = -output[22] + output[23];
   1329   step[23] = output[23] + output[22];
   1330   step[24] = output[24] + output[25];
   1331   step[25] = -output[25] + output[24];
   1332   step[26] = -output[26] + output[27];
   1333   step[27] = output[27] + output[26];
   1334   step[28] = output[28] + output[29];
   1335   step[29] = -output[29] + output[28];
   1336   step[30] = -output[30] + output[31];
   1337   step[31] = output[31] + output[30];
   1338 
   1339   // Final stage --- outputs indices are bit-reversed.
   1340   output[0]  = step[0];
   1341   output[16] = step[1];
   1342   output[8]  = step[2];
   1343   output[24] = step[3];
   1344   output[4]  = step[4];
   1345   output[20] = step[5];
   1346   output[12] = step[6];
   1347   output[28] = step[7];
   1348   output[2]  = step[8];
   1349   output[18] = step[9];
   1350   output[10] = step[10];
   1351   output[26] = step[11];
   1352   output[6]  = step[12];
   1353   output[22] = step[13];
   1354   output[14] = step[14];
   1355   output[30] = step[15];
   1356 
   1357   output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
   1358   output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
   1359   output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
   1360   output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
   1361   output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
   1362   output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
   1363   output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
   1364   output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
   1365   output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
   1366   output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
   1367   output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
   1368   output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
   1369   output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
   1370   output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
   1371   output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
   1372   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
   1373 }
   1374 
   1375 void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
   1376   int r, c;
   1377   tran_low_t sum = 0;
   1378   for (r = 0; r < 32; ++r)
   1379     for (c = 0; c < 32; ++c)
   1380       sum += input[r * stride + c];
   1381 
   1382   output[0] = sum >> 3;
   1383   output[1] = 0;
   1384 }
   1385 
   1386 void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
   1387   int i, j;
   1388   tran_high_t output[32 * 32];
   1389 
   1390   // Columns
   1391   for (i = 0; i < 32; ++i) {
   1392     tran_high_t temp_in[32], temp_out[32];
   1393     for (j = 0; j < 32; ++j)
   1394       temp_in[j] = input[j * stride + i] * 4;
   1395     fdct32(temp_in, temp_out, 0);
   1396     for (j = 0; j < 32; ++j)
   1397       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   1398   }
   1399 
   1400   // Rows
   1401   for (i = 0; i < 32; ++i) {
   1402     tran_high_t temp_in[32], temp_out[32];
   1403     for (j = 0; j < 32; ++j)
   1404       temp_in[j] = output[j + i * 32];
   1405     fdct32(temp_in, temp_out, 0);
   1406     for (j = 0; j < 32; ++j)
   1407       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   1408   }
   1409 }
   1410 
   1411 // Note that although we use dct_32_round in dct32 computation flow,
   1412 // this 2d fdct32x32 for rate-distortion optimization loop is operating
   1413 // within 16 bits precision.
   1414 void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
   1415   int i, j;
   1416   tran_high_t output[32 * 32];
   1417 
   1418   // Columns
   1419   for (i = 0; i < 32; ++i) {
   1420     tran_high_t temp_in[32], temp_out[32];
   1421     for (j = 0; j < 32; ++j)
   1422       temp_in[j] = input[j * stride + i] * 4;
   1423     fdct32(temp_in, temp_out, 0);
   1424     for (j = 0; j < 32; ++j)
   1425       // TODO(cd): see quality impact of only doing
   1426       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
   1427       //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
   1428       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   1429   }
   1430 
   1431   // Rows
   1432   for (i = 0; i < 32; ++i) {
   1433     tran_high_t temp_in[32], temp_out[32];
   1434     for (j = 0; j < 32; ++j)
   1435       temp_in[j] = output[j + i * 32];
   1436     fdct32(temp_in, temp_out, 1);
   1437     for (j = 0; j < 32; ++j)
   1438       out[j + i * 32] = temp_out[j];
   1439   }
   1440 }
   1441 
   1442 #if CONFIG_VP9_HIGHBITDEPTH
   1443 void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
   1444   vp9_fdct4x4_c(input, output, stride);
   1445 }
   1446 
   1447 void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,
   1448                        int stride, int tx_type) {
   1449   vp9_fht4x4_c(input, output, stride, tx_type);
   1450 }
   1451 
   1452 void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
   1453                           int stride) {
   1454   vp9_fdct8x8_1_c(input, final_output, stride);
   1455 }
   1456 
   1457 void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
   1458                         int stride) {
   1459   vp9_fdct8x8_c(input, final_output, stride);
   1460 }
   1461 
   1462 void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
   1463                             int stride) {
   1464   vp9_fdct16x16_1_c(input, output, stride);
   1465 }
   1466 
   1467 void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,
   1468                           int stride) {
   1469   vp9_fdct16x16_c(input, output, stride);
   1470 }
   1471 
   1472 void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,
   1473                   int stride, int tx_type) {
   1474   vp9_fht8x8_c(input, output, stride, tx_type);
   1475 }
   1476 
   1477 void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
   1478   vp9_fwht4x4_c(input, output, stride);
   1479 }
   1480 
   1481 void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,
   1482                     int stride, int tx_type) {
   1483   vp9_fht16x16_c(input, output, stride, tx_type);
   1484 }
   1485 
   1486 void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {
   1487   vp9_fdct32x32_1_c(input, out, stride);
   1488 }
   1489 
   1490 void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
   1491   vp9_fdct32x32_c(input, out, stride);
   1492 }
   1493 
   1494 void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
   1495                              int stride) {
   1496   vp9_fdct32x32_rd_c(input, out, stride);
   1497 }
   1498 #endif  // CONFIG_VP9_HIGHBITDEPTH
   1499