Home | History | Annotate | Download | only in common
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <math.h>
     13 
     14 #include "./vpx_config.h"
     15 #include "./vp9_rtcd.h"
     16 #include "vp9/common/vp9_systemdependent.h"
     17 #include "vp9/common/vp9_blockd.h"
     18 #include "vp9/common/vp9_common.h"
     19 #include "vp9/common/vp9_idct.h"
     20 
     21 #if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
     22 // When CONFIG_EMULATE_HW_HIGHBITDEPTH is 1 the transform performs strict
     23 // overflow wrapping to match expected hardware implementations.
     24 // bd of 8 uses trans_low with 16bits, need to remove 16bits
     25 // bd of 10 uses trans_low with 18bits, need to remove 14bits
     26 // bd of 12 uses trans_low with 20bits, need to remove 12bits
     27 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
     28 #define WRAPLOW(x) ((((int32_t)x) << (24 - bd)) >> (24 - bd))
     29 #else
     30 #define WRAPLOW(x) (x)
     31 #endif  // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
     32 
     33 #if CONFIG_VP9_HIGHBITDEPTH
     34 static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low,
     35                                     tran_low_t high) {
     36   return value < low ? low : (value > high ? high : value);
     37 }
     38 
     39 static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest,
     40                                             tran_high_t trans, int bd) {
     41   trans = WRAPLOW(trans);
     42   switch (bd) {
     43     case 8:
     44     default:
     45       return clamp_high(WRAPLOW(dest + trans), 0, 255);
     46     case 10:
     47       return clamp_high(WRAPLOW(dest + trans), 0, 1023);
     48     case 12:
     49       return clamp_high(WRAPLOW(dest + trans), 0, 4095);
     50   }
     51 }
     52 #endif  // CONFIG_VP9_HIGHBITDEPTH
     53 
     54 void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
     55 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     56    0.5 shifts per pixel. */
     57   int i;
     58   tran_low_t output[16];
     59   tran_high_t a1, b1, c1, d1, e1;
     60   const tran_low_t *ip = input;
     61   tran_low_t *op = output;
     62 
     63   for (i = 0; i < 4; i++) {
     64     a1 = ip[0] >> UNIT_QUANT_SHIFT;
     65     c1 = ip[1] >> UNIT_QUANT_SHIFT;
     66     d1 = ip[2] >> UNIT_QUANT_SHIFT;
     67     b1 = ip[3] >> UNIT_QUANT_SHIFT;
     68     a1 += c1;
     69     d1 -= b1;
     70     e1 = (a1 - d1) >> 1;
     71     b1 = e1 - b1;
     72     c1 = e1 - c1;
     73     a1 -= b1;
     74     d1 += c1;
     75     op[0] = a1;
     76     op[1] = b1;
     77     op[2] = c1;
     78     op[3] = d1;
     79     ip += 4;
     80     op += 4;
     81   }
     82 
     83   ip = output;
     84   for (i = 0; i < 4; i++) {
     85     a1 = ip[4 * 0];
     86     c1 = ip[4 * 1];
     87     d1 = ip[4 * 2];
     88     b1 = ip[4 * 3];
     89     a1 += c1;
     90     d1 -= b1;
     91     e1 = (a1 - d1) >> 1;
     92     b1 = e1 - b1;
     93     c1 = e1 - c1;
     94     a1 -= b1;
     95     d1 += c1;
     96     dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
     97     dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
     98     dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
     99     dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
    100 
    101     ip++;
    102     dest++;
    103   }
    104 }
    105 
    106 void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
    107   int i;
    108   tran_high_t a1, e1;
    109   tran_low_t tmp[4];
    110   const tran_low_t *ip = in;
    111   tran_low_t *op = tmp;
    112 
    113   a1 = ip[0] >> UNIT_QUANT_SHIFT;
    114   e1 = a1 >> 1;
    115   a1 -= e1;
    116   op[0] = a1;
    117   op[1] = op[2] = op[3] = e1;
    118 
    119   ip = tmp;
    120   for (i = 0; i < 4; i++) {
    121     e1 = ip[0] >> 1;
    122     a1 = ip[0] - e1;
    123     dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
    124     dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
    125     dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
    126     dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
    127     ip++;
    128     dest++;
    129   }
    130 }
    131 
    132 static void idct4(const tran_low_t *input, tran_low_t *output) {
    133   tran_low_t step[4];
    134   tran_high_t temp1, temp2;
    135   // stage 1
    136   temp1 = (input[0] + input[2]) * cospi_16_64;
    137   temp2 = (input[0] - input[2]) * cospi_16_64;
    138   step[0] = dct_const_round_shift(temp1);
    139   step[1] = dct_const_round_shift(temp2);
    140   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
    141   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
    142   step[2] = dct_const_round_shift(temp1);
    143   step[3] = dct_const_round_shift(temp2);
    144 
    145   // stage 2
    146   output[0] = step[0] + step[3];
    147   output[1] = step[1] + step[2];
    148   output[2] = step[1] - step[2];
    149   output[3] = step[0] - step[3];
    150 }
    151 
    152 void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    153   tran_low_t out[4 * 4];
    154   tran_low_t *outptr = out;
    155   int i, j;
    156   tran_low_t temp_in[4], temp_out[4];
    157 
    158   // Rows
    159   for (i = 0; i < 4; ++i) {
    160     idct4(input, outptr);
    161     input += 4;
    162     outptr += 4;
    163   }
    164 
    165   // Columns
    166   for (i = 0; i < 4; ++i) {
    167     for (j = 0; j < 4; ++j)
    168       temp_in[j] = out[j * 4 + i];
    169     idct4(temp_in, temp_out);
    170     for (j = 0; j < 4; ++j)
    171       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
    172                                   + dest[j * stride + i]);
    173   }
    174 }
    175 
    176 void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
    177                          int dest_stride) {
    178   int i;
    179   tran_high_t a1;
    180   tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
    181   out = dct_const_round_shift(out * cospi_16_64);
    182   a1 = ROUND_POWER_OF_TWO(out, 4);
    183 
    184   for (i = 0; i < 4; i++) {
    185     dest[0] = clip_pixel(dest[0] + a1);
    186     dest[1] = clip_pixel(dest[1] + a1);
    187     dest[2] = clip_pixel(dest[2] + a1);
    188     dest[3] = clip_pixel(dest[3] + a1);
    189     dest += dest_stride;
    190   }
    191 }
    192 
// 8-point inverse DCT applied to one row or column.
// The even half reuses the 4-point idct4(); the in-place call
// idct4(step1, step1) is safe because idct4 reads each input before
// writing the corresponding outputs.
static void idct8(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1: reorder inputs (even-indexed coefficients feed the 4-point
  // core) and rotate the odd inputs by the cospi_4/cospi_20 pairs.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  // stage 2 & stage 3 - even half
  idct4(step1, step1);

  // stage 2 - odd half
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  // stage 4: final butterfly combining the even and odd halves.
  output[0] = step1[0] + step1[7];
  output[1] = step1[1] + step1[6];
  output[2] = step1[2] + step1[5];
  output[3] = step1[3] + step1[4];
  output[4] = step1[3] - step1[4];
  output[5] = step1[2] - step1[5];
  output[6] = step1[1] - step1[6];
  output[7] = step1[0] - step1[7];
}
    237 
    238 void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    239   tran_low_t out[8 * 8];
    240   tran_low_t *outptr = out;
    241   int i, j;
    242   tran_low_t temp_in[8], temp_out[8];
    243 
    244   // First transform rows
    245   for (i = 0; i < 8; ++i) {
    246     idct8(input, outptr);
    247     input += 8;
    248     outptr += 8;
    249   }
    250 
    251   // Then transform columns
    252   for (i = 0; i < 8; ++i) {
    253     for (j = 0; j < 8; ++j)
    254       temp_in[j] = out[j * 8 + i];
    255     idct8(temp_in, temp_out);
    256     for (j = 0; j < 8; ++j)
    257       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
    258                                   + dest[j * stride + i]);
    259   }
    260 }
    261 
    262 void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    263   int i, j;
    264   tran_high_t a1;
    265   tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
    266   out = dct_const_round_shift(out * cospi_16_64);
    267   a1 = ROUND_POWER_OF_TWO(out, 5);
    268   for (j = 0; j < 8; ++j) {
    269     for (i = 0; i < 8; ++i)
    270       dest[i] = clip_pixel(dest[i] + a1);
    271     dest += stride;
    272   }
    273 }
    274 
// 4-point inverse ADST (asymmetric discrete sine transform) applied to one
// row or column.
static void iadst4(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[0];
  tran_high_t x1 = input[1];
  tran_high_t x2 = input[2];
  tran_high_t x3 = input[3];

  // Early out: an all-zero input transforms to all zeros.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  // Products of inputs with the sinpi_x_9 constants.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = dct_const_round_shift(s0);
  output[1] = dct_const_round_shift(s1);
  output[2] = dct_const_round_shift(s2);
  output[3] = dct_const_round_shift(s3);
}
    316 
    317 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
    318                          int tx_type) {
    319   const transform_2d IHT_4[] = {
    320     { idct4, idct4  },  // DCT_DCT  = 0
    321     { iadst4, idct4  },   // ADST_DCT = 1
    322     { idct4, iadst4 },    // DCT_ADST = 2
    323     { iadst4, iadst4 }      // ADST_ADST = 3
    324   };
    325 
    326   int i, j;
    327   tran_low_t out[4 * 4];
    328   tran_low_t *outptr = out;
    329   tran_low_t temp_in[4], temp_out[4];
    330 
    331   // inverse transform row vectors
    332   for (i = 0; i < 4; ++i) {
    333     IHT_4[tx_type].rows(input, outptr);
    334     input  += 4;
    335     outptr += 4;
    336   }
    337 
    338   // inverse transform column vectors
    339   for (i = 0; i < 4; ++i) {
    340     for (j = 0; j < 4; ++j)
    341       temp_in[j] = out[j * 4 + i];
    342     IHT_4[tx_type].cols(temp_in, temp_out);
    343     for (j = 0; j < 4; ++j)
    344       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
    345                                   + dest[j * stride + i]);
    346   }
    347 }
    348 static void iadst8(const tran_low_t *input, tran_low_t *output) {
    349   int s0, s1, s2, s3, s4, s5, s6, s7;
    350 
    351   tran_high_t x0 = input[7];
    352   tran_high_t x1 = input[0];
    353   tran_high_t x2 = input[5];
    354   tran_high_t x3 = input[2];
    355   tran_high_t x4 = input[3];
    356   tran_high_t x5 = input[4];
    357   tran_high_t x6 = input[1];
    358   tran_high_t x7 = input[6];
    359 
    360   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    361     output[0] = output[1] = output[2] = output[3] = output[4]
    362               = output[5] = output[6] = output[7] = 0;
    363     return;
    364   }
    365 
    366   // stage 1
    367   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
    368   s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
    369   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
    370   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
    371   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
    372   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
    373   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
    374   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
    375 
    376   x0 = dct_const_round_shift(s0 + s4);
    377   x1 = dct_const_round_shift(s1 + s5);
    378   x2 = dct_const_round_shift(s2 + s6);
    379   x3 = dct_const_round_shift(s3 + s7);
    380   x4 = dct_const_round_shift(s0 - s4);
    381   x5 = dct_const_round_shift(s1 - s5);
    382   x6 = dct_const_round_shift(s2 - s6);
    383   x7 = dct_const_round_shift(s3 - s7);
    384 
    385   // stage 2
    386   s0 = x0;
    387   s1 = x1;
    388   s2 = x2;
    389   s3 = x3;
    390   s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
    391   s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
    392   s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
    393   s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
    394 
    395   x0 = s0 + s2;
    396   x1 = s1 + s3;
    397   x2 = s0 - s2;
    398   x3 = s1 - s3;
    399   x4 = dct_const_round_shift(s4 + s6);
    400   x5 = dct_const_round_shift(s5 + s7);
    401   x6 = dct_const_round_shift(s4 - s6);
    402   x7 = dct_const_round_shift(s5 - s7);
    403 
    404   // stage 3
    405   s2 = cospi_16_64 * (x2 + x3);
    406   s3 = cospi_16_64 * (x2 - x3);
    407   s6 = cospi_16_64 * (x6 + x7);
    408   s7 = cospi_16_64 * (x6 - x7);
    409 
    410   x2 = dct_const_round_shift(s2);
    411   x3 = dct_const_round_shift(s3);
    412   x6 = dct_const_round_shift(s6);
    413   x7 = dct_const_round_shift(s7);
    414 
    415   output[0] =  x0;
    416   output[1] = -x4;
    417   output[2] =  x6;
    418   output[3] = -x2;
    419   output[4] =  x3;
    420   output[5] = -x7;
    421   output[6] =  x5;
    422   output[7] = -x1;
    423 }
    424 
// 2-D kernel table for the 8x8 hybrid transform, indexed by tx_type.
static const transform_2d IHT_8[] = {
  { idct8,  idct8  },  // DCT_DCT  = 0
  { iadst8, idct8  },  // ADST_DCT = 1
  { idct8,  iadst8 },  // DCT_ADST = 2
  { iadst8, iadst8 }   // ADST_ADST = 3
};
    431 
    432 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
    433                          int tx_type) {
    434   int i, j;
    435   tran_low_t out[8 * 8];
    436   tran_low_t *outptr = out;
    437   tran_low_t temp_in[8], temp_out[8];
    438   const transform_2d ht = IHT_8[tx_type];
    439 
    440   // inverse transform row vectors
    441   for (i = 0; i < 8; ++i) {
    442     ht.rows(input, outptr);
    443     input += 8;
    444     outptr += 8;
    445   }
    446 
    447   // inverse transform column vectors
    448   for (i = 0; i < 8; ++i) {
    449     for (j = 0; j < 8; ++j)
    450       temp_in[j] = out[j * 8 + i];
    451     ht.cols(temp_in, temp_out);
    452     for (j = 0; j < 8; ++j)
    453       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
    454                                   + dest[j * stride + i]);
    455   }
    456 }
    457 
    458 void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    459   tran_low_t out[8 * 8] = { 0 };
    460   tran_low_t *outptr = out;
    461   int i, j;
    462   tran_low_t temp_in[8], temp_out[8];
    463 
    464   // First transform rows
    465   // only first 4 row has non-zero coefs
    466   for (i = 0; i < 4; ++i) {
    467     idct8(input, outptr);
    468     input += 8;
    469     outptr += 8;
    470   }
    471 
    472   // Then transform columns
    473   for (i = 0; i < 8; ++i) {
    474     for (j = 0; j < 8; ++j)
    475       temp_in[j] = out[j * 8 + i];
    476     idct8(temp_in, temp_out);
    477     for (j = 0; j < 8; ++j)
    478       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
    479                                   + dest[j * stride + i]);
    480   }
    481 }
    482 
// 16-point inverse DCT applied to one row or column, as seven butterfly
// stages. The /2 input indices are compile-time constants; presumably the
// halved values were inherited from a 32-point index layout — the net
// effect is the bit-reversal-style load order below.
static void idct16(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1: reorder the inputs.
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2: rotate the odd half (8..15); even half passes through.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  // stage 3: rotate 4..7; butterflies on the odd half.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  // stage 4: rotate 0..3 (the 4-point core) and 9/14, 10/13 pairs.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: butterflies; 5/6 pair rotated by cospi_16_64.
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  // stage 6: combine the 8-point even half; rotate 10/13 and 11/12 pairs.
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly combining even and odd halves.
  output[0] = step2[0] + step2[15];
  output[1] = step2[1] + step2[14];
  output[2] = step2[2] + step2[13];
  output[3] = step2[3] + step2[12];
  output[4] = step2[4] + step2[11];
  output[5] = step2[5] + step2[10];
  output[6] = step2[6] + step2[9];
  output[7] = step2[7] + step2[8];
  output[8] = step2[7] - step2[8];
  output[9] = step2[6] - step2[9];
  output[10] = step2[5] - step2[10];
  output[11] = step2[4] - step2[11];
  output[12] = step2[3] - step2[12];
  output[13] = step2[2] - step2[13];
  output[14] = step2[1] - step2[14];
  output[15] = step2[0] - step2[15];
}
    647 
    648 void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
    649                              int stride) {
    650   tran_low_t out[16 * 16];
    651   tran_low_t *outptr = out;
    652   int i, j;
    653   tran_low_t temp_in[16], temp_out[16];
    654 
    655   // First transform rows
    656   for (i = 0; i < 16; ++i) {
    657     idct16(input, outptr);
    658     input += 16;
    659     outptr += 16;
    660   }
    661 
    662   // Then transform columns
    663   for (i = 0; i < 16; ++i) {
    664     for (j = 0; j < 16; ++j)
    665       temp_in[j] = out[j * 16 + i];
    666     idct16(temp_in, temp_out);
    667     for (j = 0; j < 16; ++j)
    668       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
    669                                   + dest[j * stride + i]);
    670   }
    671 }
    672 
// 16-point inverse ADST applied to one row or column: four stages of
// cospi rotations and butterflies, then a sign/permutation on output.
static void iadst16(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  // Load with the ADST input permutation.
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // Early out: an all-zero input transforms to all zeros.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  x0 = dct_const_round_shift(s0 + s8);
  x1 = dct_const_round_shift(s1 + s9);
  x2 = dct_const_round_shift(s2 + s10);
  x3 = dct_const_round_shift(s3 + s11);
  x4 = dct_const_round_shift(s4 + s12);
  x5 = dct_const_round_shift(s5 + s13);
  x6 = dct_const_round_shift(s6 + s14);
  x7 = dct_const_round_shift(s7 + s15);
  x8  = dct_const_round_shift(s0 - s8);
  x9  = dct_const_round_shift(s1 - s9);
  x10 = dct_const_round_shift(s2 - s10);
  x11 = dct_const_round_shift(s3 - s11);
  x12 = dct_const_round_shift(s4 - s12);
  x13 = dct_const_round_shift(s5 - s13);
  x14 = dct_const_round_shift(s6 - s14);
  x15 = dct_const_round_shift(s7 - s15);

  // stage 2: rotate only the upper half (8..15).
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = s0 - s4;
  x5 = s1 - s5;
  x6 = s2 - s6;
  x7 = s3 - s7;
  x8 = dct_const_round_shift(s8 + s12);
  x9 = dct_const_round_shift(s9 + s13);
  x10 = dct_const_round_shift(s10 + s14);
  x11 = dct_const_round_shift(s11 + s15);
  x12 = dct_const_round_shift(s8 - s12);
  x13 = dct_const_round_shift(s9 - s13);
  x14 = dct_const_round_shift(s10 - s14);
  x15 = dct_const_round_shift(s11 - s15);

  // stage 3: cospi_8/cospi_24 rotations on the 4..7 and 12..15 groups.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);
  x8 = s8 + s10;
  x9 = s9 + s11;
  x10 = s8 - s10;
  x11 = s9 - s11;
  x12 = dct_const_round_shift(s12 + s14);
  x13 = dct_const_round_shift(s13 + s15);
  x14 = dct_const_round_shift(s12 - s14);
  x15 = dct_const_round_shift(s13 - s15);

  // stage 4: final cospi_16_64 rotations.
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);
  x10 = dct_const_round_shift(s10);
  x11 = dct_const_round_shift(s11);
  x14 = dct_const_round_shift(s14);
  x15 = dct_const_round_shift(s15);

  // Output with the ADST sign/permutation pattern.
  output[0] =  x0;
  output[1] = -x8;
  output[2] =  x12;
  output[3] = -x4;
  output[4] =  x6;
  output[5] =  x14;
  output[6] =  x10;
  output[7] =  x2;
  output[8] =  x3;
  output[9] =  x11;
  output[10] =  x15;
  output[11] =  x7;
  output[12] =  x5;
  output[13] = -x13;
  output[14] =  x9;
  output[15] = -x1;
}
    844 
// 2-D kernel table for the 16x16 hybrid transform, indexed by tx_type.
static const transform_2d IHT_16[] = {
  { idct16,  idct16  },  // DCT_DCT  = 0
  { iadst16, idct16  },  // ADST_DCT = 1
  { idct16,  iadst16 },  // DCT_ADST = 2
  { iadst16, iadst16 }   // ADST_ADST = 3
};
    851 
    852 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
    853                             int tx_type) {
    854   int i, j;
    855   tran_low_t out[16 * 16];
    856   tran_low_t *outptr = out;
    857   tran_low_t temp_in[16], temp_out[16];
    858   const transform_2d ht = IHT_16[tx_type];
    859 
    860   // Rows
    861   for (i = 0; i < 16; ++i) {
    862     ht.rows(input, outptr);
    863     input += 16;
    864     outptr += 16;
    865   }
    866 
    867   // Columns
    868   for (i = 0; i < 16; ++i) {
    869     for (j = 0; j < 16; ++j)
    870       temp_in[j] = out[j * 16 + i];
    871     ht.cols(temp_in, temp_out);
    872     for (j = 0; j < 16; ++j)
    873       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
    874                                         + dest[j * stride + i]);
    875   }
    876 }
    877 
    878 void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
    879                             int stride) {
    880   tran_low_t out[16 * 16] = { 0 };
    881   tran_low_t *outptr = out;
    882   int i, j;
    883   tran_low_t temp_in[16], temp_out[16];
    884 
    885   // First transform rows. Since all non-zero dct coefficients are in
    886   // upper-left 4x4 area, we only need to calculate first 4 rows here.
    887   for (i = 0; i < 4; ++i) {
    888     idct16(input, outptr);
    889     input += 16;
    890     outptr += 16;
    891   }
    892 
    893   // Then transform columns
    894   for (i = 0; i < 16; ++i) {
    895     for (j = 0; j < 16; ++j)
    896       temp_in[j] = out[j*16 + i];
    897     idct16(temp_in, temp_out);
    898     for (j = 0; j < 16; ++j)
    899       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
    900                                   + dest[j * stride + i]);
    901   }
    902 }
    903 
    904 void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    905   int i, j;
    906   tran_high_t a1;
    907   tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
    908   out = dct_const_round_shift(out * cospi_16_64);
    909   a1 = ROUND_POWER_OF_TWO(out, 6);
    910   for (j = 0; j < 16; ++j) {
    911     for (i = 0; i < 16; ++i)
    912       dest[i] = clip_pixel(dest[i] + a1);
    913     dest += stride;
    914   }
    915 }
    916 
// 32-point 1-D inverse DCT implemented as a butterfly network. The input
// coefficients are consumed in the interleaved (bit-reversed-style) order
// produced by the forward transform; each cospi_* rotation is followed by
// dct_const_round_shift to return the product to coefficient precision.
// step1/step2 alternate as source/destination across the stages.
static void idct32(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1
  // Even half: straight reordering. Odd half (16..31): paired rotations.
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = dct_const_round_shift(temp1);
  step1[31] = dct_const_round_shift(temp2);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = dct_const_round_shift(temp1);
  step1[30] = dct_const_round_shift(temp2);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = dct_const_round_shift(temp1);
  step1[28] = dct_const_round_shift(temp2);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = dct_const_round_shift(temp1);
  step1[24] = dct_const_round_shift(temp2);

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  // Add/subtract butterflies on the odd half.
  step2[16] = step1[16] + step1[17];
  step2[17] = step1[16] - step1[17];
  step2[18] = -step1[18] + step1[19];
  step2[19] = step1[18] + step1[19];
  step2[20] = step1[20] + step1[21];
  step2[21] = step1[20] - step1[21];
  step2[22] = -step1[22] + step1[23];
  step2[23] = step1[22] + step1[23];
  step2[24] = step1[24] + step1[25];
  step2[25] = step1[24] - step1[25];
  step2[26] = -step1[26] + step1[27];
  step2[27] = step1[26] + step1[27];
  step2[28] = step1[28] + step1[29];
  step2[29] = step1[28] - step1[29];
  step2[30] = -step1[30] + step1[31];
  step2[31] = step1[30] + step1[31];

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = dct_const_round_shift(temp1);
  step1[30] = dct_const_round_shift(temp2);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = step1[16] + step1[19];
  step2[17] = step1[17] + step1[18];
  step2[18] = step1[17] - step1[18];
  step2[19] = step1[16] - step1[19];
  step2[20] = -step1[20] + step1[23];
  step2[21] = -step1[21] + step1[22];
  step2[22] = step1[21] + step1[22];
  step2[23] = step1[20] + step1[23];

  step2[24] = step1[24] + step1[27];
  step2[25] = step1[25] + step1[26];
  step2[26] = step1[25] - step1[26];
  step2[27] = step1[24] - step1[27];
  step2[28] = -step1[28] + step1[31];
  step2[29] = -step1[29] + step1[30];
  step2[30] = step1[29] + step1[30];
  step2[31] = step1[28] + step1[31];

  // stage 5
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = dct_const_round_shift(temp1);
  step1[28] = dct_const_round_shift(temp2);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = step1[16] + step1[23];
  step2[17] = step1[17] + step1[22];
  step2[18] = step1[18] + step1[21];
  step2[19] = step1[19] + step1[20];
  step2[20] = step1[19] - step1[20];
  step2[21] = step1[18] - step1[21];
  step2[22] = step1[17] - step1[22];
  step2[23] = step1[16] - step1[23];

  step2[24] = -step1[24] + step1[31];
  step2[25] = -step1[25] + step1[30];
  step2[26] = -step1[26] + step1[29];
  step2[27] = -step1[27] + step1[28];
  step2[28] = step1[27] + step1[28];
  step2[29] = step1[26] + step1[29];
  step2[30] = step1[25] + step1[30];
  step2[31] = step1[24] + step1[31];

  // stage 7
  step1[0] = step2[0] + step2[15];
  step1[1] = step2[1] + step2[14];
  step1[2] = step2[2] + step2[13];
  step1[3] = step2[3] + step2[12];
  step1[4] = step2[4] + step2[11];
  step1[5] = step2[5] + step2[10];
  step1[6] = step2[6] + step2[9];
  step1[7] = step2[7] + step2[8];
  step1[8] = step2[7] - step2[8];
  step1[9] = step2[6] - step2[9];
  step1[10] = step2[5] - step2[10];
  step1[11] = step2[4] - step2[11];
  step1[12] = step2[3] - step2[12];
  step1[13] = step2[2] - step2[13];
  step1[14] = step2[1] - step2[14];
  step1[15] = step2[0] - step2[15];

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = dct_const_round_shift(temp1);
  step1[24] = dct_const_round_shift(temp2);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  // Symmetric output butterflies: output[i] / output[31 - i] are the
  // sum / difference of the same step1 pair.
  output[0] = step1[0] + step1[31];
  output[1] = step1[1] + step1[30];
  output[2] = step1[2] + step1[29];
  output[3] = step1[3] + step1[28];
  output[4] = step1[4] + step1[27];
  output[5] = step1[5] + step1[26];
  output[6] = step1[6] + step1[25];
  output[7] = step1[7] + step1[24];
  output[8] = step1[8] + step1[23];
  output[9] = step1[9] + step1[22];
  output[10] = step1[10] + step1[21];
  output[11] = step1[11] + step1[20];
  output[12] = step1[12] + step1[19];
  output[13] = step1[13] + step1[18];
  output[14] = step1[14] + step1[17];
  output[15] = step1[15] + step1[16];
  output[16] = step1[15] - step1[16];
  output[17] = step1[14] - step1[17];
  output[18] = step1[13] - step1[18];
  output[19] = step1[12] - step1[19];
  output[20] = step1[11] - step1[20];
  output[21] = step1[10] - step1[21];
  output[22] = step1[9] - step1[22];
  output[23] = step1[8] - step1[23];
  output[24] = step1[7] - step1[24];
  output[25] = step1[6] - step1[25];
  output[26] = step1[5] - step1[26];
  output[27] = step1[4] - step1[27];
  output[28] = step1[3] - step1[28];
  output[29] = step1[2] - step1[29];
  output[30] = step1[1] - step1[30];
  output[31] = step1[0] - step1[31];
}
   1283 
   1284 void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
   1285                               int stride) {
   1286   tran_low_t out[32 * 32];
   1287   tran_low_t *outptr = out;
   1288   int i, j;
   1289   tran_low_t temp_in[32], temp_out[32];
   1290 
   1291   // Rows
   1292   for (i = 0; i < 32; ++i) {
   1293     int16_t zero_coeff[16];
   1294     for (j = 0; j < 16; ++j)
   1295       zero_coeff[j] = input[2 * j] | input[2 * j + 1];
   1296     for (j = 0; j < 8; ++j)
   1297       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
   1298     for (j = 0; j < 4; ++j)
   1299       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
   1300     for (j = 0; j < 2; ++j)
   1301       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
   1302 
   1303     if (zero_coeff[0] | zero_coeff[1])
   1304       idct32(input, outptr);
   1305     else
   1306       vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
   1307     input += 32;
   1308     outptr += 32;
   1309   }
   1310 
   1311   // Columns
   1312   for (i = 0; i < 32; ++i) {
   1313     for (j = 0; j < 32; ++j)
   1314       temp_in[j] = out[j * 32 + i];
   1315     idct32(temp_in, temp_out);
   1316     for (j = 0; j < 32; ++j)
   1317       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
   1318                                         + dest[j * stride + i]);
   1319   }
   1320 }
   1321 
   1322 void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
   1323                             int stride) {
   1324   tran_low_t out[32 * 32] = {0};
   1325   tran_low_t *outptr = out;
   1326   int i, j;
   1327   tran_low_t temp_in[32], temp_out[32];
   1328 
   1329   // Rows
   1330   // only upper-left 8x8 has non-zero coeff
   1331   for (i = 0; i < 8; ++i) {
   1332     idct32(input, outptr);
   1333     input += 32;
   1334     outptr += 32;
   1335   }
   1336 
   1337   // Columns
   1338   for (i = 0; i < 32; ++i) {
   1339     for (j = 0; j < 32; ++j)
   1340       temp_in[j] = out[j * 32 + i];
   1341     idct32(temp_in, temp_out);
   1342     for (j = 0; j < 32; ++j)
   1343       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
   1344                                   + dest[j * stride + i]);
   1345   }
   1346 }
   1347 
   1348 void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   1349   int i, j;
   1350   tran_high_t a1;
   1351 
   1352   tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
   1353   out = dct_const_round_shift(out * cospi_16_64);
   1354   a1 = ROUND_POWER_OF_TWO(out, 6);
   1355 
   1356   for (j = 0; j < 32; ++j) {
   1357     for (i = 0; i < 32; ++i)
   1358       dest[i] = clip_pixel(dest[i] + a1);
   1359     dest += stride;
   1360   }
   1361 }
   1362 
   1363 // idct
   1364 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
   1365                      int eob) {
   1366   if (eob > 1)
   1367     vp9_idct4x4_16_add(input, dest, stride);
   1368   else
   1369     vp9_idct4x4_1_add(input, dest, stride);
   1370 }
   1371 
   1372 
   1373 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
   1374                      int eob) {
   1375   if (eob > 1)
   1376     vp9_iwht4x4_16_add(input, dest, stride);
   1377   else
   1378     vp9_iwht4x4_1_add(input, dest, stride);
   1379 }
   1380 
   1381 void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
   1382                      int eob) {
   1383   // If dc is 1, then input[0] is the reconstructed value, do not need
   1384   // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
   1385 
   1386   // The calculation can be simplified if there are not many non-zero dct
   1387   // coefficients. Use eobs to decide what to do.
   1388   // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
   1389   // Combine that with code here.
   1390   if (eob == 1)
   1391     // DC only DCT coefficient
   1392     vp9_idct8x8_1_add(input, dest, stride);
   1393   else if (eob <= 12)
   1394     vp9_idct8x8_12_add(input, dest, stride);
   1395   else
   1396     vp9_idct8x8_64_add(input, dest, stride);
   1397 }
   1398 
   1399 void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
   1400                        int eob) {
   1401   /* The calculation can be simplified if there are not many non-zero dct
   1402    * coefficients. Use eobs to separate different cases. */
   1403   if (eob == 1)
   1404     /* DC only DCT coefficient. */
   1405     vp9_idct16x16_1_add(input, dest, stride);
   1406   else if (eob <= 10)
   1407     vp9_idct16x16_10_add(input, dest, stride);
   1408   else
   1409     vp9_idct16x16_256_add(input, dest, stride);
   1410 }
   1411 
   1412 void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
   1413                        int eob) {
   1414   if (eob == 1)
   1415     vp9_idct32x32_1_add(input, dest, stride);
   1416   else if (eob <= 34)
   1417     // non-zero coeff only in upper-left 8x8
   1418     vp9_idct32x32_34_add(input, dest, stride);
   1419   else
   1420     vp9_idct32x32_1024_add(input, dest, stride);
   1421 }
   1422 
   1423 // iht
   1424 void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
   1425                     int stride, int eob) {
   1426   if (tx_type == DCT_DCT)
   1427     vp9_idct4x4_add(input, dest, stride, eob);
   1428   else
   1429     vp9_iht4x4_16_add(input, dest, stride, tx_type);
   1430 }
   1431 
   1432 void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
   1433                     int stride, int eob) {
   1434   if (tx_type == DCT_DCT) {
   1435     vp9_idct8x8_add(input, dest, stride, eob);
   1436   } else {
   1437     vp9_iht8x8_64_add(input, dest, stride, tx_type);
   1438   }
   1439 }
   1440 
   1441 void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
   1442                       int stride, int eob) {
   1443   if (tx_type == DCT_DCT) {
   1444     vp9_idct16x16_add(input, dest, stride, eob);
   1445   } else {
   1446     vp9_iht16x16_256_add(input, dest, stride, tx_type);
   1447   }
   1448 }
   1449 
   1450 #if CONFIG_VP9_HIGHBITDEPTH
// High bit-depth inverse 4x4 Walsh-Hadamard transform with reconstruction
// (lossless path). Row pass writes wrapped intermediates; column pass adds
// the result into the uint16 destination with bit-depth-aware clamping.
void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                               int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
  // dest8 actually holds 16-bit pixels in high bit-depth builds.
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass: note the inputs are consumed in 0,2,3,1 butterfly order
  // (a1, c1, d1, b1) to realize the reversible lifting structure.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    // WRAPLOW emulates hardware overflow wrapping when configured.
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  // Column pass: same lifting steps down each column, then reconstruct.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd);
    dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd);
    dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd);
    dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd);

    ip++;
    dest++;
  }
}
   1504 
// High bit-depth 4-point 1-D inverse DCT. Each rotation product is rounded
// back to coefficient precision with dct_const_round_shift and wrapped with
// WRAPLOW to emulate hardware overflow behavior when that emulation is
// configured.
static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  (void) bd;  // bd is only read by WRAPLOW under the emulation config.
  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
  step[3] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: output butterflies (sum/difference of symmetric pairs).
  output[0] = WRAPLOW(step[0] + step[3]);
  output[1] = WRAPLOW(step[1] + step[2]);
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
}
   1525 
// High bit-depth DC-only inverse 4x4 Walsh-Hadamard transform with
// reconstruction: the single DC input is split into one row of values,
// which the column pass then spreads over the whole 4x4 block.
void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                              int dest_stride, int bd) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
  // dest8 actually holds 16-bit pixels in high bit-depth builds.
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  (void) bd;  // bd is only read by WRAPLOW under the emulation config.

  // Row pass collapses to splitting the DC term: a1 keeps the rounded-up
  // half, e1 the rounded-down half.
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1);
  op[1] = op[2] = op[3] = WRAPLOW(e1);

  // Column pass: split each intermediate the same way down its column.
  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd);
    dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd);
    dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd);
    dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd);
    ip++;
    dest++;
  }
}
   1554 
   1555 void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
   1556                                int stride, int bd) {
   1557   tran_low_t out[4 * 4];
   1558   tran_low_t *outptr = out;
   1559   int i, j;
   1560   tran_low_t temp_in[4], temp_out[4];
   1561   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1562 
   1563   // Rows
   1564   for (i = 0; i < 4; ++i) {
   1565     high_idct4(input, outptr, bd);
   1566     input += 4;
   1567     outptr += 4;
   1568   }
   1569 
   1570   // Columns
   1571   for (i = 0; i < 4; ++i) {
   1572     for (j = 0; j < 4; ++j)
   1573       temp_in[j] = out[j * 4 + i];
   1574     high_idct4(temp_in, temp_out, bd);
   1575     for (j = 0; j < 4; ++j)
   1576       dest[j * stride + i] = clip_pixel_bd_high(
   1577           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
   1578   }
   1579 }
   1580 
   1581 void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
   1582                               int dest_stride, int bd) {
   1583   int i;
   1584   tran_high_t a1;
   1585   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
   1586   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1587 
   1588   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   1589   a1 = ROUND_POWER_OF_TWO(out, 4);
   1590 
   1591   for (i = 0; i < 4; i++) {
   1592     dest[0] = clip_pixel_bd_high(dest[0], a1, bd);
   1593     dest[1] = clip_pixel_bd_high(dest[1], a1, bd);
   1594     dest[2] = clip_pixel_bd_high(dest[2], a1, bd);
   1595     dest[3] = clip_pixel_bd_high(dest[3], a1, bd);
   1596     dest += dest_stride;
   1597   }
   1598 }
   1599 
// High bit-depth 8-point 1-D inverse DCT. The even half reuses high_idct4;
// the odd half is computed with cospi rotations followed by
// dct_const_round_shift; all intermediates are wrapped with WRAPLOW to
// emulate hardware overflow behavior when that emulation is configured.
static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1: gather even inputs (0,4,2,6) and rotate odd inputs (1,7,5,3).
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2 & stage 3 - even half
  high_idct4(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4: output butterflies (sum/difference of symmetric pairs).
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
   1644 
   1645 void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
   1646                                int stride, int bd) {
   1647   tran_low_t out[8 * 8];
   1648   tran_low_t *outptr = out;
   1649   int i, j;
   1650   tran_low_t temp_in[8], temp_out[8];
   1651   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1652 
   1653   // First transform rows.
   1654   for (i = 0; i < 8; ++i) {
   1655     high_idct8(input, outptr, bd);
   1656     input += 8;
   1657     outptr += 8;
   1658   }
   1659 
   1660   // Then transform columns.
   1661   for (i = 0; i < 8; ++i) {
   1662     for (j = 0; j < 8; ++j)
   1663       temp_in[j] = out[j * 8 + i];
   1664     high_idct8(temp_in, temp_out, bd);
   1665     for (j = 0; j < 8; ++j)
   1666       dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i],
   1667                                         ROUND_POWER_OF_TWO(temp_out[j], 5),
   1668                                         bd);
   1669   }
   1670 }
   1671 
   1672 void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
   1673                               int stride, int bd) {
   1674   int i, j;
   1675   tran_high_t a1;
   1676   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
   1677   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1678   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   1679   a1 = ROUND_POWER_OF_TWO(out, 5);
   1680   for (j = 0; j < 8; ++j) {
   1681     for (i = 0; i < 8; ++i)
   1682       dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
   1683     dest += stride;
   1684   }
   1685 }
   1686 
// 1-D 4-point inverse ADST, high-bitdepth path.
// Reads 4 coefficients from |input| and writes 4 values to |output|.
// The (void) cast silences the unused-parameter warning in builds where
// WRAPLOW() does not reference |bd|.
static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[0];
  tran_high_t x1 = input[1];
  tran_high_t x2 = input[2];
  tran_high_t x3 = input[3];
  (void) bd;

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3)) {
    vpx_memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0));
  output[1] = WRAPLOW(dct_const_round_shift(s1));
  output[2] = WRAPLOW(dct_const_round_shift(s2));
  output[3] = WRAPLOW(dct_const_round_shift(s3));
}
   1729 
   1730 void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
   1731                               int stride, int tx_type, int bd) {
   1732   const high_transform_2d IHT_4[] = {
   1733     { high_idct4, high_idct4  },    // DCT_DCT  = 0
   1734     { high_iadst4, high_idct4 },    // ADST_DCT = 1
   1735     { high_idct4, high_iadst4 },    // DCT_ADST = 2
   1736     { high_iadst4, high_iadst4 }    // ADST_ADST = 3
   1737   };
   1738   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1739 
   1740   int i, j;
   1741   tran_low_t out[4 * 4];
   1742   tran_low_t *outptr = out;
   1743   tran_low_t temp_in[4], temp_out[4];
   1744 
   1745   // Inverse transform row vectors.
   1746   for (i = 0; i < 4; ++i) {
   1747     IHT_4[tx_type].rows(input, outptr, bd);
   1748     input  += 4;
   1749     outptr += 4;
   1750   }
   1751 
   1752   // Inverse transform column vectors.
   1753   for (i = 0; i < 4; ++i) {
   1754     for (j = 0; j < 4; ++j)
   1755       temp_in[j] = out[j * 4 + i];
   1756     IHT_4[tx_type].cols(temp_in, temp_out, bd);
   1757     for (j = 0; j < 4; ++j)
   1758       dest[j * stride + i] = clip_pixel_bd_high(
   1759           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
   1760   }
   1761 }
   1762 
// 1-D 8-point inverse ADST, high-bitdepth path.
// Reads 8 coefficients from |input| (in the permuted order below) and
// writes 8 values to |output|.  The (void) cast silences the
// unused-parameter warning in builds where WRAPLOW() does not use |bd|.
static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];
  (void) bd;

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    vpx_memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Final output permutation with alternating sign flips.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
   1839 
// Row/column 1-D transform pairs for each 8x8 hybrid transform type,
// indexed by tx_type.
static const high_transform_2d HIGH_IHT_8[] = {
  { high_idct8,  high_idct8  },  // DCT_DCT  = 0
  { high_iadst8, high_idct8  },  // ADST_DCT = 1
  { high_idct8,  high_iadst8 },  // DCT_ADST = 2
  { high_iadst8, high_iadst8 }   // ADST_ADST = 3
};
   1846 
   1847 void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
   1848                               int stride, int tx_type, int bd) {
   1849   int i, j;
   1850   tran_low_t out[8 * 8];
   1851   tran_low_t *outptr = out;
   1852   tran_low_t temp_in[8], temp_out[8];
   1853   const high_transform_2d ht = HIGH_IHT_8[tx_type];
   1854   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1855 
   1856   // Inverse transform row vectors.
   1857   for (i = 0; i < 8; ++i) {
   1858     ht.rows(input, outptr, bd);
   1859     input += 8;
   1860     outptr += 8;
   1861   }
   1862 
   1863   // Inverse transform column vectors.
   1864   for (i = 0; i < 8; ++i) {
   1865     for (j = 0; j < 8; ++j)
   1866       temp_in[j] = out[j * 8 + i];
   1867     ht.cols(temp_in, temp_out, bd);
   1868     for (j = 0; j < 8; ++j)
   1869       dest[j * stride + i] = clip_pixel_bd_high(
   1870           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
   1871   }
   1872 }
   1873 
   1874 void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
   1875                                int stride, int bd) {
   1876   tran_low_t out[8 * 8] = { 0 };
   1877   tran_low_t *outptr = out;
   1878   int i, j;
   1879   tran_low_t temp_in[8], temp_out[8];
   1880   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1881 
   1882   // First transform rows.
   1883   // Only first 4 row has non-zero coefs.
   1884   for (i = 0; i < 4; ++i) {
   1885     high_idct8(input, outptr, bd);
   1886     input += 8;
   1887     outptr += 8;
   1888   }
   1889   // Then transform columns.
   1890   for (i = 0; i < 8; ++i) {
   1891     for (j = 0; j < 8; ++j)
   1892       temp_in[j] = out[j * 8 + i];
   1893     high_idct8(temp_in, temp_out, bd);
   1894     for (j = 0; j < 8; ++j)
   1895       dest[j * stride + i] = clip_pixel_bd_high(
   1896           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
   1897   }
   1898 }
   1899 
// 1-D 16-point inverse DCT butterfly, high-bitdepth path.
// Reads 16 coefficients from |input| and writes 16 values to |output|
// through a seven-stage butterfly network.  The (void) cast silences the
// unused-parameter warning in builds where WRAPLOW() does not use |bd|.
static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1: bit-reversal-style input reordering.  The N/2 index
  // spellings are compile-time constants (e.g. 16/2 == 8).
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2: rotate the odd half (8..15); even half passes through.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3: rotate 4..7; add/sub butterflies on the odd half.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4: rotations for 0..3, butterflies for 4..7, mixed rotations
  // for 9/14 and 10/13.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly producing the 16 outputs.
  output[0] = WRAPLOW(step2[0] + step2[15]);
  output[1] = WRAPLOW(step2[1] + step2[14]);
  output[2] = WRAPLOW(step2[2] + step2[13]);
  output[3] = WRAPLOW(step2[3] + step2[12]);
  output[4] = WRAPLOW(step2[4] + step2[11]);
  output[5] = WRAPLOW(step2[5] + step2[10]);
  output[6] = WRAPLOW(step2[6] + step2[9]);
  output[7] = WRAPLOW(step2[7] + step2[8]);
  output[8] = WRAPLOW(step2[7] - step2[8]);
  output[9] = WRAPLOW(step2[6] - step2[9]);
  output[10] = WRAPLOW(step2[5] - step2[10]);
  output[11] = WRAPLOW(step2[4] - step2[11]);
  output[12] = WRAPLOW(step2[3] - step2[12]);
  output[13] = WRAPLOW(step2[2] - step2[13]);
  output[14] = WRAPLOW(step2[1] - step2[14]);
  output[15] = WRAPLOW(step2[0] - step2[15]);
}
   2065 
   2066 void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
   2067                                   int stride, int bd) {
   2068   tran_low_t out[16 * 16];
   2069   tran_low_t *outptr = out;
   2070   int i, j;
   2071   tran_low_t temp_in[16], temp_out[16];
   2072   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   2073 
   2074   // First transform rows.
   2075   for (i = 0; i < 16; ++i) {
   2076     high_idct16(input, outptr, bd);
   2077     input += 16;
   2078     outptr += 16;
   2079   }
   2080 
   2081   // Then transform columns.
   2082   for (i = 0; i < 16; ++i) {
   2083     for (j = 0; j < 16; ++j)
   2084       temp_in[j] = out[j * 16 + i];
   2085     high_idct16(temp_in, temp_out, bd);
   2086     for (j = 0; j < 16; ++j)
   2087       dest[j * stride + i] = clip_pixel_bd_high(
   2088           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2089   }
   2090 }
   2091 
// 1-D 16-point inverse ADST, high-bitdepth path.
// Reads 16 coefficients from |input| (in the permuted order below) and
// writes 16 values to |output| through a four-stage butterfly network.
// The (void) cast silences the unused-parameter warning in builds where
// WRAPLOW() does not use |bd|.
static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];
  (void) bd;

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    vpx_memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8  = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9  = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4: final cospi_16_64 rotations.
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Final output permutation with selected sign flips.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
   2261 
// Row/column 1-D transform pairs for each 16x16 hybrid transform type,
// indexed by tx_type.
static const high_transform_2d HIGH_IHT_16[] = {
  { high_idct16,  high_idct16  },  // DCT_DCT  = 0
  { high_iadst16, high_idct16  },  // ADST_DCT = 1
  { high_idct16,  high_iadst16 },  // DCT_ADST = 2
  { high_iadst16, high_iadst16 }   // ADST_ADST = 3
};
   2268 
   2269 void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
   2270                                  int stride, int tx_type, int bd) {
   2271   int i, j;
   2272   tran_low_t out[16 * 16];
   2273   tran_low_t *outptr = out;
   2274   tran_low_t temp_in[16], temp_out[16];
   2275   const high_transform_2d ht = HIGH_IHT_16[tx_type];
   2276   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   2277 
   2278   // Rows
   2279   for (i = 0; i < 16; ++i) {
   2280     ht.rows(input, outptr, bd);
   2281     input += 16;
   2282     outptr += 16;
   2283   }
   2284 
   2285   // Columns
   2286   for (i = 0; i < 16; ++i) {
   2287     for (j = 0; j < 16; ++j)
   2288       temp_in[j] = out[j * 16 + i];
   2289     ht.cols(temp_in, temp_out, bd);
   2290     for (j = 0; j < 16; ++j)
   2291       dest[j * stride + i] = clip_pixel_bd_high(
   2292           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2293   }
   2294 }
   2295 
   2296 void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
   2297                                  int stride, int bd) {
   2298   tran_low_t out[16 * 16] = { 0 };
   2299   tran_low_t *outptr = out;
   2300   int i, j;
   2301   tran_low_t temp_in[16], temp_out[16];
   2302   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   2303 
   2304   // First transform rows. Since all non-zero dct coefficients are in
   2305   // upper-left 4x4 area, we only need to calculate first 4 rows here.
   2306   for (i = 0; i < 4; ++i) {
   2307     high_idct16(input, outptr, bd);
   2308     input += 16;
   2309     outptr += 16;
   2310   }
   2311 
   2312   // Then transform columns.
   2313   for (i = 0; i < 16; ++i) {
   2314     for (j = 0; j < 16; ++j)
   2315       temp_in[j] = out[j*16 + i];
   2316     high_idct16(temp_in, temp_out, bd);
   2317     for (j = 0; j < 16; ++j)
   2318       dest[j * stride + i] = clip_pixel_bd_high(
   2319           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2320   }
   2321 }
   2322 
   2323 void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
   2324                                 int stride, int bd) {
   2325   int i, j;
   2326   tran_high_t a1;
   2327   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
   2328   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   2329 
   2330   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   2331   a1 = ROUND_POWER_OF_TWO(out, 6);
   2332   for (j = 0; j < 16; ++j) {
   2333     for (i = 0; i < 16; ++i)
   2334       dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
   2335     dest += stride;
   2336   }
   2337 }
   2338 
   2339 static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
   2340   tran_low_t step1[32], step2[32];
   2341   tran_high_t temp1, temp2;
   2342   (void) bd;
   2343 
   2344   // stage 1
   2345   step1[0] = input[0];
   2346   step1[1] = input[16];
   2347   step1[2] = input[8];
   2348   step1[3] = input[24];
   2349   step1[4] = input[4];
   2350   step1[5] = input[20];
   2351   step1[6] = input[12];
   2352   step1[7] = input[28];
   2353   step1[8] = input[2];
   2354   step1[9] = input[18];
   2355   step1[10] = input[10];
   2356   step1[11] = input[26];
   2357   step1[12] = input[6];
   2358   step1[13] = input[22];
   2359   step1[14] = input[14];
   2360   step1[15] = input[30];
   2361 
   2362   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   2363   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
   2364   step1[16] = WRAPLOW(dct_const_round_shift(temp1));
   2365   step1[31] = WRAPLOW(dct_const_round_shift(temp2));
   2366 
   2367   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   2368   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
   2369   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
   2370   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
   2371 
   2372   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   2373   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
   2374   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
   2375   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   2376 
   2377   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   2378   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
   2379   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
   2380   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
   2381 
   2382   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   2383   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
   2384   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
   2385   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   2386 
   2387   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   2388   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
   2389   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
   2390   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   2391 
   2392   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   2393   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
   2394   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
   2395   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   2396 
   2397   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   2398   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
   2399   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
   2400   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
   2401 
   2402   // stage 2
   2403   step2[0] = step1[0];
   2404   step2[1] = step1[1];
   2405   step2[2] = step1[2];
   2406   step2[3] = step1[3];
   2407   step2[4] = step1[4];
   2408   step2[5] = step1[5];
   2409   step2[6] = step1[6];
   2410   step2[7] = step1[7];
   2411 
   2412   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   2413   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
   2414   step2[8] = WRAPLOW(dct_const_round_shift(temp1));
   2415   step2[15] = WRAPLOW(dct_const_round_shift(temp2));
   2416 
   2417   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   2418   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
   2419   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
   2420   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
   2421 
   2422   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   2423   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
   2424   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
   2425   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   2426 
   2427   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   2428   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
   2429   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
   2430   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   2431 
   2432   step2[16] = WRAPLOW(step1[16] + step1[17]);
   2433   step2[17] = WRAPLOW(step1[16] - step1[17]);
   2434   step2[18] = WRAPLOW(-step1[18] + step1[19]);
   2435   step2[19] = WRAPLOW(step1[18] + step1[19]);
   2436   step2[20] = WRAPLOW(step1[20] + step1[21]);
   2437   step2[21] = WRAPLOW(step1[20] - step1[21]);
   2438   step2[22] = WRAPLOW(-step1[22] + step1[23]);
   2439   step2[23] = WRAPLOW(step1[22] + step1[23]);
   2440   step2[24] = WRAPLOW(step1[24] + step1[25]);
   2441   step2[25] = WRAPLOW(step1[24] - step1[25]);
   2442   step2[26] = WRAPLOW(-step1[26] + step1[27]);
   2443   step2[27] = WRAPLOW(step1[26] + step1[27]);
   2444   step2[28] = WRAPLOW(step1[28] + step1[29]);
   2445   step2[29] = WRAPLOW(step1[28] - step1[29]);
   2446   step2[30] = WRAPLOW(-step1[30] + step1[31]);
   2447   step2[31] = WRAPLOW(step1[30] + step1[31]);
   2448 
   2449   // stage 3
   2450   step1[0] = step2[0];
   2451   step1[1] = step2[1];
   2452   step1[2] = step2[2];
   2453   step1[3] = step2[3];
   2454 
   2455   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   2456   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
   2457   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
   2458   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   2459   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   2460   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
   2461   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
   2462   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   2463 
   2464   step1[8] = WRAPLOW(step2[8] + step2[9]);
   2465   step1[9] = WRAPLOW(step2[8] - step2[9]);
   2466   step1[10] = WRAPLOW(-step2[10] + step2[11]);
   2467   step1[11] = WRAPLOW(step2[10] + step2[11]);
   2468   step1[12] = WRAPLOW(step2[12] + step2[13]);
   2469   step1[13] = WRAPLOW(step2[12] - step2[13]);
   2470   step1[14] = WRAPLOW(-step2[14] + step2[15]);
   2471   step1[15] = WRAPLOW(step2[14] + step2[15]);
   2472 
   2473   step1[16] = step2[16];
   2474   step1[31] = step2[31];
   2475   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   2476   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
   2477   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
   2478   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
   2479   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   2480   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
   2481   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
   2482   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   2483   step1[19] = step2[19];
   2484   step1[20] = step2[20];
   2485   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   2486   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
   2487   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
   2488   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   2489   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   2490   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
   2491   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
   2492   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   2493   step1[23] = step2[23];
   2494   step1[24] = step2[24];
   2495   step1[27] = step2[27];
   2496   step1[28] = step2[28];
   2497 
   2498   // stage 4
   2499   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   2500   temp2 = (step1[0] - step1[1]) * cospi_16_64;
   2501   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
   2502   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   2503   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   2504   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
   2505   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
   2506   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
   2507   step2[4] = WRAPLOW(step1[4] + step1[5]);
   2508   step2[5] = WRAPLOW(step1[4] - step1[5]);
   2509   step2[6] = WRAPLOW(-step1[6] + step1[7]);
   2510   step2[7] = WRAPLOW(step1[6] + step1[7]);
   2511 
   2512   step2[8] = step1[8];
   2513   step2[15] = step1[15];
   2514   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   2515   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
   2516   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
   2517   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
   2518   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   2519   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
   2520   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
   2521   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   2522   step2[11] = step1[11];
   2523   step2[12] = step1[12];
   2524 
   2525   step2[16] = WRAPLOW(step1[16] + step1[19]);
   2526   step2[17] = WRAPLOW(step1[17] + step1[18]);
   2527   step2[18] = WRAPLOW(step1[17] - step1[18]);
   2528   step2[19] = WRAPLOW(step1[16] - step1[19]);
   2529   step2[20] = WRAPLOW(-step1[20] + step1[23]);
   2530   step2[21] = WRAPLOW(-step1[21] + step1[22]);
   2531   step2[22] = WRAPLOW(step1[21] + step1[22]);
   2532   step2[23] = WRAPLOW(step1[20] + step1[23]);
   2533 
   2534   step2[24] = WRAPLOW(step1[24] + step1[27]);
   2535   step2[25] = WRAPLOW(step1[25] + step1[26]);
   2536   step2[26] = WRAPLOW(step1[25] - step1[26]);
   2537   step2[27] = WRAPLOW(step1[24] - step1[27]);
   2538   step2[28] = WRAPLOW(-step1[28] + step1[31]);
   2539   step2[29] = WRAPLOW(-step1[29] + step1[30]);
   2540   step2[30] = WRAPLOW(step1[29] + step1[30]);
   2541   step2[31] = WRAPLOW(step1[28] + step1[31]);
   2542 
   2543   // stage 5
   2544   step1[0] = WRAPLOW(step2[0] + step2[3]);
   2545   step1[1] = WRAPLOW(step2[1] + step2[2]);
   2546   step1[2] = WRAPLOW(step2[1] - step2[2]);
   2547   step1[3] = WRAPLOW(step2[0] - step2[3]);
   2548   step1[4] = step2[4];
   2549   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   2550   temp2 = (step2[5] + step2[6]) * cospi_16_64;
   2551   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
   2552   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   2553   step1[7] = step2[7];
   2554 
   2555   step1[8] = WRAPLOW(step2[8] + step2[11]);
   2556   step1[9] = WRAPLOW(step2[9] + step2[10]);
   2557   step1[10] = WRAPLOW(step2[9] - step2[10]);
   2558   step1[11] = WRAPLOW(step2[8] - step2[11]);
   2559   step1[12] = WRAPLOW(-step2[12] + step2[15]);
   2560   step1[13] = WRAPLOW(-step2[13] + step2[14]);
   2561   step1[14] = WRAPLOW(step2[13] + step2[14]);
   2562   step1[15] = WRAPLOW(step2[12] + step2[15]);
   2563 
   2564   step1[16] = step2[16];
   2565   step1[17] = step2[17];
   2566   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   2567   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
   2568   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
   2569   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   2570   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   2571   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
   2572   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
   2573   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
   2574   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   2575   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
   2576   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
   2577   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   2578   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   2579   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
   2580   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
   2581   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   2582   step1[22] = step2[22];
   2583   step1[23] = step2[23];
   2584   step1[24] = step2[24];
   2585   step1[25] = step2[25];
   2586   step1[30] = step2[30];
   2587   step1[31] = step2[31];
   2588 
   2589   // stage 6
   2590   step2[0] = WRAPLOW(step1[0] + step1[7]);
   2591   step2[1] = WRAPLOW(step1[1] + step1[6]);
   2592   step2[2] = WRAPLOW(step1[2] + step1[5]);
   2593   step2[3] = WRAPLOW(step1[3] + step1[4]);
   2594   step2[4] = WRAPLOW(step1[3] - step1[4]);
   2595   step2[5] = WRAPLOW(step1[2] - step1[5]);
   2596   step2[6] = WRAPLOW(step1[1] - step1[6]);
   2597   step2[7] = WRAPLOW(step1[0] - step1[7]);
   2598   step2[8] = step1[8];
   2599   step2[9] = step1[9];
   2600   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   2601   temp2 = (step1[10] + step1[13]) * cospi_16_64;
   2602   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
   2603   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   2604   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   2605   temp2 = (step1[11] + step1[12]) * cospi_16_64;
   2606   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
   2607   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   2608   step2[14] = WRAPLOW(step1[14]);
   2609   step2[15] = WRAPLOW(step1[15]);
   2610 
   2611   step2[16] = WRAPLOW(step1[16] + step1[23]);
   2612   step2[17] = WRAPLOW(step1[17] + step1[22]);
   2613   step2[18] = WRAPLOW(step1[18] + step1[21]);
   2614   step2[19] = WRAPLOW(step1[19] + step1[20]);
   2615   step2[20] = WRAPLOW(step1[19] - step1[20]);
   2616   step2[21] = WRAPLOW(step1[18] - step1[21]);
   2617   step2[22] = WRAPLOW(step1[17] - step1[22]);
   2618   step2[23] = WRAPLOW(step1[16] - step1[23]);
   2619 
   2620   step2[24] = WRAPLOW(-step1[24] + step1[31]);
   2621   step2[25] = WRAPLOW(-step1[25] + step1[30]);
   2622   step2[26] = WRAPLOW(-step1[26] + step1[29]);
   2623   step2[27] = WRAPLOW(-step1[27] + step1[28]);
   2624   step2[28] = WRAPLOW(step1[27] + step1[28]);
   2625   step2[29] = WRAPLOW(step1[26] + step1[29]);
   2626   step2[30] = WRAPLOW(step1[25] + step1[30]);
   2627   step2[31] = WRAPLOW(step1[24] + step1[31]);
   2628 
   2629   // stage 7
   2630   step1[0] = WRAPLOW(step2[0] + step2[15]);
   2631   step1[1] = WRAPLOW(step2[1] + step2[14]);
   2632   step1[2] = WRAPLOW(step2[2] + step2[13]);
   2633   step1[3] = WRAPLOW(step2[3] + step2[12]);
   2634   step1[4] = WRAPLOW(step2[4] + step2[11]);
   2635   step1[5] = WRAPLOW(step2[5] + step2[10]);
   2636   step1[6] = WRAPLOW(step2[6] + step2[9]);
   2637   step1[7] = WRAPLOW(step2[7] + step2[8]);
   2638   step1[8] = WRAPLOW(step2[7] - step2[8]);
   2639   step1[9] = WRAPLOW(step2[6] - step2[9]);
   2640   step1[10] = WRAPLOW(step2[5] - step2[10]);
   2641   step1[11] = WRAPLOW(step2[4] - step2[11]);
   2642   step1[12] = WRAPLOW(step2[3] - step2[12]);
   2643   step1[13] = WRAPLOW(step2[2] - step2[13]);
   2644   step1[14] = WRAPLOW(step2[1] - step2[14]);
   2645   step1[15] = WRAPLOW(step2[0] - step2[15]);
   2646 
   2647   step1[16] = step2[16];
   2648   step1[17] = step2[17];
   2649   step1[18] = step2[18];
   2650   step1[19] = step2[19];
   2651   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   2652   temp2 = (step2[20] + step2[27]) * cospi_16_64;
   2653   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
   2654   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   2655   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   2656   temp2 = (step2[21] + step2[26]) * cospi_16_64;
   2657   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
   2658   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   2659   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   2660   temp2 = (step2[22] + step2[25]) * cospi_16_64;
   2661   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
   2662   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   2663   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   2664   temp2 = (step2[23] + step2[24]) * cospi_16_64;
   2665   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
   2666   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
   2667   step1[28] = step2[28];
   2668   step1[29] = step2[29];
   2669   step1[30] = step2[30];
   2670   step1[31] = step2[31];
   2671 
   2672   // final stage
   2673   output[0] = WRAPLOW(step1[0] + step1[31]);
   2674   output[1] = WRAPLOW(step1[1] + step1[30]);
   2675   output[2] = WRAPLOW(step1[2] + step1[29]);
   2676   output[3] = WRAPLOW(step1[3] + step1[28]);
   2677   output[4] = WRAPLOW(step1[4] + step1[27]);
   2678   output[5] = WRAPLOW(step1[5] + step1[26]);
   2679   output[6] = WRAPLOW(step1[6] + step1[25]);
   2680   output[7] = WRAPLOW(step1[7] + step1[24]);
   2681   output[8] = WRAPLOW(step1[8] + step1[23]);
   2682   output[9] = WRAPLOW(step1[9] + step1[22]);
   2683   output[10] = WRAPLOW(step1[10] + step1[21]);
   2684   output[11] = WRAPLOW(step1[11] + step1[20]);
   2685   output[12] = WRAPLOW(step1[12] + step1[19]);
   2686   output[13] = WRAPLOW(step1[13] + step1[18]);
   2687   output[14] = WRAPLOW(step1[14] + step1[17]);
   2688   output[15] = WRAPLOW(step1[15] + step1[16]);
   2689   output[16] = WRAPLOW(step1[15] - step1[16]);
   2690   output[17] = WRAPLOW(step1[14] - step1[17]);
   2691   output[18] = WRAPLOW(step1[13] - step1[18]);
   2692   output[19] = WRAPLOW(step1[12] - step1[19]);
   2693   output[20] = WRAPLOW(step1[11] - step1[20]);
   2694   output[21] = WRAPLOW(step1[10] - step1[21]);
   2695   output[22] = WRAPLOW(step1[9] - step1[22]);
   2696   output[23] = WRAPLOW(step1[8] - step1[23]);
   2697   output[24] = WRAPLOW(step1[7] - step1[24]);
   2698   output[25] = WRAPLOW(step1[6] - step1[25]);
   2699   output[26] = WRAPLOW(step1[5] - step1[26]);
   2700   output[27] = WRAPLOW(step1[4] - step1[27]);
   2701   output[28] = WRAPLOW(step1[3] - step1[28]);
   2702   output[29] = WRAPLOW(step1[2] - step1[29]);
   2703   output[30] = WRAPLOW(step1[1] - step1[30]);
   2704   output[31] = WRAPLOW(step1[0] - step1[31]);
   2705 }
   2706 
   2707 void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
   2708                                    int stride, int bd) {
   2709   tran_low_t out[32 * 32];
   2710   tran_low_t *outptr = out;
   2711   int i, j;
   2712   tran_low_t temp_in[32], temp_out[32];
   2713   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   2714 
   2715   // Rows
   2716   for (i = 0; i < 32; ++i) {
   2717     tran_low_t zero_coeff[16];
   2718     for (j = 0; j < 16; ++j)
   2719       zero_coeff[j] = input[2 * j] | input[2 * j + 1];
   2720     for (j = 0; j < 8; ++j)
   2721       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
   2722     for (j = 0; j < 4; ++j)
   2723       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
   2724     for (j = 0; j < 2; ++j)
   2725       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
   2726 
   2727     if (zero_coeff[0] | zero_coeff[1])
   2728       high_idct32(input, outptr, bd);
   2729     else
   2730       vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
   2731     input += 32;
   2732     outptr += 32;
   2733   }
   2734 
   2735   // Columns
   2736   for (i = 0; i < 32; ++i) {
   2737     for (j = 0; j < 32; ++j)
   2738       temp_in[j] = out[j * 32 + i];
   2739     high_idct32(temp_in, temp_out, bd);
   2740     for (j = 0; j < 32; ++j)
   2741       dest[j * stride + i] = clip_pixel_bd_high(
   2742           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2743   }
   2744 }
   2745 
   2746 void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
   2747                                  int stride, int bd) {
   2748   tran_low_t out[32 * 32] = {0};
   2749   tran_low_t *outptr = out;
   2750   int i, j;
   2751   tran_low_t temp_in[32], temp_out[32];
   2752   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   2753 
   2754   // Rows
   2755   // Only upper-left 8x8 has non-zero coeff.
   2756   for (i = 0; i < 8; ++i) {
   2757     high_idct32(input, outptr, bd);
   2758     input += 32;
   2759     outptr += 32;
   2760   }
   2761   // Columns
   2762   for (i = 0; i < 32; ++i) {
   2763     for (j = 0; j < 32; ++j)
   2764       temp_in[j] = out[j * 32 + i];
   2765     high_idct32(temp_in, temp_out, bd);
   2766     for (j = 0; j < 32; ++j)
   2767       dest[j * stride + i] = clip_pixel_bd_high(
   2768           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2769   }
   2770 }
   2771 
   2772 void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
   2773                                 int stride, int bd) {
   2774   int i, j;
   2775   int a1;
   2776   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   2777 
   2778   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
   2779   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   2780   a1 = ROUND_POWER_OF_TWO(out, 6);
   2781 
   2782   for (j = 0; j < 32; ++j) {
   2783     for (i = 0; i < 32; ++i)
   2784       dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
   2785     dest += stride;
   2786   }
   2787 }
   2788 
   2789 // idct
   2790 void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
   2791                           int eob, int bd) {
   2792   if (eob > 1)
   2793     vp9_high_idct4x4_16_add(input, dest, stride, bd);
   2794   else
   2795     vp9_high_idct4x4_1_add(input, dest, stride, bd);
   2796 }
   2797 
   2798 
   2799 void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
   2800                           int eob, int bd) {
   2801   if (eob > 1)
   2802     vp9_high_iwht4x4_16_add(input, dest, stride, bd);
   2803   else
   2804     vp9_high_iwht4x4_1_add(input, dest, stride, bd);
   2805 }
   2806 
   2807 void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
   2808                           int eob, int bd) {
   2809   // If dc is 1, then input[0] is the reconstructed value, do not need
   2810   // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
   2811 
   2812   // The calculation can be simplified if there are not many non-zero dct
   2813   // coefficients. Use eobs to decide what to do.
   2814   // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
   2815   // Combine that with code here.
   2816   // DC only DCT coefficient
   2817   if (eob == 1) {
   2818     vp9_high_idct8x8_1_add(input, dest, stride, bd);
   2819   } else if (eob <= 10) {
   2820     vp9_high_idct8x8_10_add(input, dest, stride, bd);
   2821   } else {
   2822     vp9_high_idct8x8_64_add(input, dest, stride, bd);
   2823   }
   2824 }
   2825 
   2826 void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
   2827                        int eob, int bd) {
   2828   // The calculation can be simplified if there are not many non-zero dct
   2829   // coefficients. Use eobs to separate different cases.
   2830   // DC only DCT coefficient.
   2831   if (eob == 1) {
   2832     vp9_high_idct16x16_1_add(input, dest, stride, bd);
   2833   } else if (eob <= 10) {
   2834     vp9_high_idct16x16_10_add(input, dest, stride, bd);
   2835   } else {
   2836     vp9_high_idct16x16_256_add(input, dest, stride, bd);
   2837   }
   2838 }
   2839 
   2840 void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
   2841                        int eob, int bd) {
   2842   // Non-zero coeff only in upper-left 8x8
   2843   if (eob == 1) {
   2844     vp9_high_idct32x32_1_add(input, dest, stride, bd);
   2845   } else if (eob <= 34) {
   2846     vp9_high_idct32x32_34_add(input, dest, stride, bd);
   2847   } else {
   2848     vp9_high_idct32x32_1024_add(input, dest, stride, bd);
   2849   }
   2850 }
   2851 
   2852 // iht
   2853 void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
   2854                          uint8_t *dest, int stride, int eob, int bd) {
   2855   if (tx_type == DCT_DCT)
   2856     vp9_high_idct4x4_add(input, dest, stride, eob, bd);
   2857   else
   2858     vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd);
   2859 }
   2860 
   2861 void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
   2862                          uint8_t *dest, int stride, int eob, int bd) {
   2863   if (tx_type == DCT_DCT) {
   2864     vp9_high_idct8x8_add(input, dest, stride, eob, bd);
   2865   } else {
   2866     vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd);
   2867   }
   2868 }
   2869 
   2870 void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
   2871                            uint8_t *dest, int stride, int eob, int bd) {
   2872   if (tx_type == DCT_DCT) {
   2873     vp9_high_idct16x16_add(input, dest, stride, eob, bd);
   2874   } else {
   2875     vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd);
   2876   }
   2877 }
   2878 #endif  // CONFIG_VP9_HIGHBITDEPTH
   2879