/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <math.h>
#include <string.h>

#include "vpx_dsp/inv_txfm.h"

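/* Quick reference for the helpers used below (the authoritative definitions
 * live in vpx_dsp/inv_txfm.h and vpx_dsp/txfm_common.h):
 *   - cospi_N_64 is cos(N * pi / 64) in Q14 fixed point, i.e. roughly
 *     (tran_high_t)(cos(N * M_PI / 64.0) * (1 << 14)).
 *   - dct_const_round_shift(x) rounds a Q14 product back down:
 *     ROUND_POWER_OF_TWO(x, 14) == (x + (1 << 13)) >> 14.
 *   - WRAPLOW(x, bd) narrows an intermediate value; in hardware-emulation
 *     builds (CONFIG_EMULATE_HARDWARE) it wraps to a signed 16-bit range for
 *     bd == 8, otherwise it is a plain cast.
 *   - clip_pixel_add(p, v) adds a residual to a pixel and clamps to
 *     [0, 255]; highbd_clip_pixel_add clamps to [0, (1 << bd) - 1].
 */
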
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
   0.5 shifts per pixel. */
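/* The forward WHT used for lossless coding scales coefficients up by
   UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT), so each coefficient is shifted
   back down by UNIT_QUANT_SHIFT before the inverse transform proper. */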
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1, 8);
    op[1] = WRAPLOW(b1, 8);
    op[2] = WRAPLOW(c1, 8);
    op[3] = WRAPLOW(d1, 8);
    ip += 4;
    op += 4;
  }

  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);

    ip++;
    dest++;
  }
}

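/* DC-only inverse WHT: with in[0] the only non-zero coefficient, the first
   pass reduces to splitting a1 into a1 - (a1 >> 1) and a1 >> 1, and the
   second pass repeats the same split down each column. */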
void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1, 8);
  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
    ip++;
    dest++;
  }
}

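/* 4-point IDCT. Stage 1 is two Q14 rotations: the even inputs form a
   sum/difference pair scaled by cospi_16_64 (cos(pi/4), i.e. sqrt(2)/2 in
   Q14), and the odd inputs are rotated by the (cospi_24_64, cospi_8_64)
   pair. Stage 2 is the output butterfly. */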
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  output[0] = WRAPLOW(step[0] + step[3], 8);
  output[1] = WRAPLOW(step[1] + step[2], 8);
  output[2] = WRAPLOW(step[1] - step[2], 8);
  output[3] = WRAPLOW(step[0] - step[3], 8);
}

void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    idct4_c(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    idct4_c(temp_in, temp_out);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
    }
  }
}

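/* DC-only 4x4 IDCT: both passes reduce to multiplying the DC coefficient by
   cospi_16_64 (with rounding), so a single residual a1 is computed once and
   added to all 16 pixels. */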
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride) {
  int i;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel_add(dest[0], a1);
    dest[1] = clip_pixel_add(dest[1], a1);
    dest[2] = clip_pixel_add(dest[2], a1);
    dest[3] = clip_pixel_add(dest[3], a1);
    dest += dest_stride;
  }
}

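/* 8-point IDCT in four stages. The even-indexed inputs (0, 2, 4, 6) pass
   through the same computation as idct4_c (stages 2 and 3 here); the
   odd-indexed inputs (1, 3, 5, 7) are rotated in stage 1 and recombined
   through stages 2-4. */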
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  // stage 3
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  // stage 4
  output[0] = WRAPLOW(step1[0] + step1[7], 8);
  output[1] = WRAPLOW(step1[1] + step1[6], 8);
  output[2] = WRAPLOW(step1[2] + step1[5], 8);
  output[3] = WRAPLOW(step1[3] + step1[4], 8);
  output[4] = WRAPLOW(step1[3] - step1[4], 8);
  output[5] = WRAPLOW(step1[2] - step1[5], 8);
  output[6] = WRAPLOW(step1[1] - step1[6], 8);
  output[7] = WRAPLOW(step1[0] - step1[7], 8);
}

void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  for (i = 0; i < 8; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}

void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

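/* 4-point inverse ADST, built from the sinpi_k_9 Q14 constants derived from
   sin(k * pi / 9) rather than the cospi table. The explicit all-zero
   shortcut matters here because this path has no cheaper DC-only variant. */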
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}

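/* 8-point inverse ADST. Inputs are consumed in a permuted order
   (x0 = input[7], x1 = input[0], ...), and the odd-indexed outputs are
   written negated at the end; both are part of the ADST basis definition. */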
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64  * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64  * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
  s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);

  // stage 2
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2, 8);
  x1 = WRAPLOW(s1 + s3, 8);
  x2 = WRAPLOW(s0 - s2, 8);
  x3 = WRAPLOW(s1 - s3, 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);

  // stage 3
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);

  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x4, 8);
  output[2] = WRAPLOW(x6, 8);
  output[3] = WRAPLOW(-x2, 8);
  output[4] = WRAPLOW(x3, 8);
  output[5] = WRAPLOW(-x7, 8);
  output[6] = WRAPLOW(x5, 8);
  output[7] = WRAPLOW(-x1, 8);
}

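/* Shortcut for nearly-empty 8x8 blocks (at most 12 non-zero coefficients,
   per the function-name convention). The non-zero coefficients all lie in
   the first 4 rows, so only those rows get a row transform; out[] is
   zero-initialized to cover the remainder. */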
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows.
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}

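/* 16-point IDCT in seven stages. The stage-1 indices are written as the
   32-point load order divided by two (input[0/2], input[16/2], ...), which
   makes the correspondence with idct32_c's stage 1 easy to check. */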
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = WRAPLOW(step2[0] + step2[15], 8);
  output[1] = WRAPLOW(step2[1] + step2[14], 8);
  output[2] = WRAPLOW(step2[2] + step2[13], 8);
  output[3] = WRAPLOW(step2[3] + step2[12], 8);
  output[4] = WRAPLOW(step2[4] + step2[11], 8);
  output[5] = WRAPLOW(step2[5] + step2[10], 8);
  output[6] = WRAPLOW(step2[6] + step2[9], 8);
  output[7] = WRAPLOW(step2[7] + step2[8], 8);
  output[8] = WRAPLOW(step2[7] - step2[8], 8);
  output[9] = WRAPLOW(step2[6] - step2[9], 8);
  output[10] = WRAPLOW(step2[5] - step2[10], 8);
  output[11] = WRAPLOW(step2[4] - step2[11], 8);
  output[12] = WRAPLOW(step2[3] - step2[12], 8);
  output[13] = WRAPLOW(step2[2] - step2[13], 8);
  output[14] = WRAPLOW(step2[1] - step2[14], 8);
  output[15] = WRAPLOW(step2[0] - step2[15], 8);
}

void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

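/* 16-point inverse ADST, following the same pattern as iadst8_c: permuted
   inputs, four butterfly stages, and sign flips folded into the final
   output writes. */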
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4, 8);
  x1 = WRAPLOW(s1 + s5, 8);
  x2 = WRAPLOW(s2 + s6, 8);
  x3 = WRAPLOW(s3 + s7, 8);
  x4 = WRAPLOW(s0 - s4, 8);
  x5 = WRAPLOW(s1 - s5, 8);
  x6 = WRAPLOW(s2 - s6, 8);
  x7 = WRAPLOW(s3 - s7, 8);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

  x0 = WRAPLOW(check_range(s0 + s2), 8);
  x1 = WRAPLOW(check_range(s1 + s3), 8);
  x2 = WRAPLOW(check_range(s0 - s2), 8);
  x3 = WRAPLOW(check_range(s1 - s3), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
  x8 = WRAPLOW(check_range(s8 + s10), 8);
  x9 = WRAPLOW(check_range(s9 + s11), 8);
  x10 = WRAPLOW(check_range(s8 - s10), 8);
  x11 = WRAPLOW(check_range(s9 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s15), 8);

  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x8, 8);
  output[2] = WRAPLOW(x12, 8);
  output[3] = WRAPLOW(-x4, 8);
  output[4] = WRAPLOW(x6, 8);
  output[5] = WRAPLOW(x14, 8);
  output[6] = WRAPLOW(x10, 8);
  output[7] = WRAPLOW(x2, 8);
  output[8] = WRAPLOW(x3, 8);
  output[9] = WRAPLOW(x11, 8);
  output[10] = WRAPLOW(x15, 8);
  output[11] = WRAPLOW(x7, 8);
  output[12] = WRAPLOW(x5, 8);
  output[13] = WRAPLOW(-x13, 8);
  output[14] = WRAPLOW(x9, 8);
  output[15] = WRAPLOW(-x1, 8);
}

void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero DCT coefficients are in the
  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
  for (i = 0; i < 4; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

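/* 32-point IDCT in eight stages (seven labeled stages plus the final output
   butterfly). The even-indexed inputs feed the same structure as idct16_c;
   the odd-indexed inputs go through the extra stage-1 rotations below. */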
void idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
  step2[31] = WRAPLOW(step1[30] + step1[31], 8);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
  step2[23] = WRAPLOW(step1[20] + step1[23], 8);

  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
  step2[31] = WRAPLOW(step1[28] + step1[31], 8);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
  step2[23] = WRAPLOW(step1[16] - step1[23], 8);

  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
  step2[31] = WRAPLOW(step1[24] + step1[31], 8);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
  step1[15] = WRAPLOW(step2[0] - step2[15], 8);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = WRAPLOW(step1[0] + step1[31], 8);
  output[1] = WRAPLOW(step1[1] + step1[30], 8);
  output[2] = WRAPLOW(step1[2] + step1[29], 8);
  output[3] = WRAPLOW(step1[3] + step1[28], 8);
  output[4] = WRAPLOW(step1[4] + step1[27], 8);
  output[5] = WRAPLOW(step1[5] + step1[26], 8);
  output[6] = WRAPLOW(step1[6] + step1[25], 8);
  output[7] = WRAPLOW(step1[7] + step1[24], 8);
  output[8] = WRAPLOW(step1[8] + step1[23], 8);
  output[9] = WRAPLOW(step1[9] + step1[22], 8);
  output[10] = WRAPLOW(step1[10] + step1[21], 8);
  output[11] = WRAPLOW(step1[11] + step1[20], 8);
  output[12] = WRAPLOW(step1[12] + step1[19], 8);
  output[13] = WRAPLOW(step1[13] + step1[18], 8);
  output[14] = WRAPLOW(step1[14] + step1[17], 8);
  output[15] = WRAPLOW(step1[15] + step1[16], 8);
  output[16] = WRAPLOW(step1[15] - step1[16], 8);
  output[17] = WRAPLOW(step1[14] - step1[17], 8);
  output[18] = WRAPLOW(step1[13] - step1[18], 8);
  output[19] = WRAPLOW(step1[12] - step1[19], 8);
  output[20] = WRAPLOW(step1[11] - step1[20], 8);
  output[21] = WRAPLOW(step1[10] - step1[21], 8);
  output[22] = WRAPLOW(step1[9] - step1[22], 8);
  output[23] = WRAPLOW(step1[8] - step1[23], 8);
  output[24] = WRAPLOW(step1[7] - step1[24], 8);
  output[25] = WRAPLOW(step1[6] - step1[25], 8);
  output[26] = WRAPLOW(step1[5] - step1[26], 8);
  output[27] = WRAPLOW(step1[4] - step1[27], 8);
  output[28] = WRAPLOW(step1[3] - step1[28], 8);
  output[29] = WRAPLOW(step1[2] - step1[29], 8);
  output[30] = WRAPLOW(step1[1] - step1[30], 8);
  output[31] = WRAPLOW(step1[0] - step1[31], 8);
}

void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
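  // OR-reduce each row's 32 coefficients in log2(32) passes; rows that turn
  // out to be entirely zero skip idct32_c and are cleared with memset
  // instead.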
  for (i = 0; i < 32; ++i) {
    int16_t zero_coeff[16];
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  // Only the upper-left 8x8 area has non-zero coefficients.
  for (i = 0; i < 8; ++i) {
    idct32_c(input, outptr);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;

  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

#if CONFIG_VP9_HIGHBITDEPTH
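/* High-bit-depth builds store pixels as uint16_t. Each function below
   recovers the real pointer with CONVERT_TO_SHORTPTR(dest8) and threads the
   bit depth bd (8, 10, or 12) through to the clamping and rounding
   helpers. */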
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1, bd);
    op[1] = WRAPLOW(b1, bd);
    op[2] = WRAPLOW(c1, bd);
    op[3] = WRAPLOW(d1, bd);
    ip += 4;
    op += 4;
  }

  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);

    ip++;
    dest++;
  }
}

void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                                int dest_stride, int bd) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1, bd);
  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = highbd_clip_pixel_add(
        dest[dest_stride * 0], a1, bd);
    dest[dest_stride * 1] = highbd_clip_pixel_add(
        dest[dest_stride * 1], e1, bd);
    dest[dest_stride * 2] = highbd_clip_pixel_add(
        dest[dest_stride * 2], e1, bd);
    dest[dest_stride * 3] = highbd_clip_pixel_add(
        dest[dest_stride * 3], e1, bd);
    ip++;
    dest++;
  }
}

   1327 void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1328   tran_low_t step[4];
   1329   tran_high_t temp1, temp2;
   1330   (void) bd;
   1331   // stage 1
   1332   temp1 = (input[0] + input[2]) * cospi_16_64;
   1333   temp2 = (input[0] - input[2]) * cospi_16_64;
   1334   step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1335   step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1336   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   1337   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
   1338   step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1339   step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1340 
   1341   // stage 2
   1342   output[0] = WRAPLOW(step[0] + step[3], bd);
   1343   output[1] = WRAPLOW(step[1] + step[2], bd);
   1344   output[2] = WRAPLOW(step[1] - step[2], bd);
   1345   output[3] = WRAPLOW(step[0] - step[3], bd);
   1346 }
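
/* Illustrative sketch, not part of the library: a DC impulse makes the
 * sqrt(2)/2 per-pass scaling of the 4-point butterfly visible.  With input
 * {64, 0, 0, 0}, temp1 == temp2 == 64 * cospi_16_64, and
 * (64 * 11585 + 8192) >> 14 = 45, roughly 64/sqrt(2); step[0] = step[1] = 45,
 * step[2] = step[3] = 0, so all four outputs are the flat value 45. */
static void highbd_idct4_dc_demo(void) {
  const tran_low_t in[4] = { 64, 0, 0, 0 };
  tran_low_t out[4];
  vpx_highbd_idct4_c(in, out, 8);  /* out[0..3] are all 45 */
}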
   1347 
   1348 void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
   1349                                  int stride, int bd) {
   1350   tran_low_t out[4 * 4];
   1351   tran_low_t *outptr = out;
   1352   int i, j;
   1353   tran_low_t temp_in[4], temp_out[4];
   1354   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1355 
   1356   // Rows
   1357   for (i = 0; i < 4; ++i) {
   1358     vpx_highbd_idct4_c(input, outptr, bd);
   1359     input += 4;
   1360     outptr += 4;
   1361   }
   1362 
   1363   // Columns
   1364   for (i = 0; i < 4; ++i) {
   1365     for (j = 0; j < 4; ++j)
   1366       temp_in[j] = out[j * 4 + i];
   1367     vpx_highbd_idct4_c(temp_in, temp_out, bd);
   1368     for (j = 0; j < 4; ++j) {
   1369       dest[j * stride + i] = highbd_clip_pixel_add(
   1370           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
   1371     }
   1372   }
   1373 }
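
/* Illustrative usage sketch (the helper and its buffer are hypothetical,
 * not part of the library): high-bitdepth destinations are uint16_t pixel
 * buffers passed through CONVERT_TO_BYTEPTR(), mirroring the
 * CONVERT_TO_SHORTPTR() unwrap at the top of the function above. */
static void highbd_idct4x4_usage_demo(uint16_t *pred /* 4x4, stride 4 */) {
  tran_low_t coeffs[4 * 4] = { 0 };
  coeffs[0] = 256;  /* DC-only residual */
  vpx_highbd_idct4x4_16_add_c(coeffs, CONVERT_TO_BYTEPTR(pred), 4, 10);
}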
   1374 
   1375 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
   1376                                 int dest_stride, int bd) {
   1377   int i;
   1378   tran_high_t a1;
   1379   tran_low_t out = WRAPLOW(
   1380       highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
   1381   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1382 
   1383   out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
   1384   a1 = ROUND_POWER_OF_TWO(out, 4);
   1385 
   1386   for (i = 0; i < 4; i++) {
   1387     dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
   1388     dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
   1389     dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
   1390     dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
   1391     dest += dest_stride;
   1392   }
   1393 }
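
/* Editorial note: the final rounding shift tracks the transform size -- 4
 * for 4x4 (above), 5 for 8x8, and 6 for 16x16 and 32x32 -- matching the
 * scaling that the corresponding forward transforms accumulate. */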
   1394 
   1395 void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1396   tran_low_t step1[8], step2[8];
   1397   tran_high_t temp1, temp2;
   1398   // stage 1
   1399   step1[0] = input[0];
   1400   step1[2] = input[4];
   1401   step1[1] = input[2];
   1402   step1[3] = input[6];
   1403   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   1404   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
   1405   step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1406   step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1407   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   1408   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
   1409   step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1410   step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1411 
   1412   // stage 2 & stage 3 - even half
   1413   vpx_highbd_idct4_c(step1, step1, bd);
   1414 
   1415   // stage 2 - odd half
   1416   step2[4] = WRAPLOW(step1[4] + step1[5], bd);
   1417   step2[5] = WRAPLOW(step1[4] - step1[5], bd);
   1418   step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
   1419   step2[7] = WRAPLOW(step1[6] + step1[7], bd);
   1420 
   1421   // stage 3 - odd half
   1422   step1[4] = step2[4];
   1423   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   1424   temp2 = (step2[5] + step2[6]) * cospi_16_64;
   1425   step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1426   step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1427   step1[7] = step2[7];
   1428 
   1429   // stage 4
   1430   output[0] = WRAPLOW(step1[0] + step1[7], bd);
   1431   output[1] = WRAPLOW(step1[1] + step1[6], bd);
   1432   output[2] = WRAPLOW(step1[2] + step1[5], bd);
   1433   output[3] = WRAPLOW(step1[3] + step1[4], bd);
   1434   output[4] = WRAPLOW(step1[3] - step1[4], bd);
   1435   output[5] = WRAPLOW(step1[2] - step1[5], bd);
   1436   output[6] = WRAPLOW(step1[1] - step1[6], bd);
   1437   output[7] = WRAPLOW(step1[0] - step1[7], bd);
   1438 }
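
/* Editorial note: an 8-point IDCT decomposes into a 4-point IDCT on the
 * even-indexed coefficients plus butterflies on the odd half.  That is why
 * step1[0..3] above is loaded from input[0], input[2], input[4], input[6]
 * (permuted into the order vpx_highbd_idct4_c expects) and transformed in
 * place at "stage 2 & stage 3 - even half". */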
   1439 
   1440 void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
   1441                                  int stride, int bd) {
   1442   tran_low_t out[8 * 8];
   1443   tran_low_t *outptr = out;
   1444   int i, j;
   1445   tran_low_t temp_in[8], temp_out[8];
   1446   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1447 
   1448   // First transform rows.
   1449   for (i = 0; i < 8; ++i) {
   1450     vpx_highbd_idct8_c(input, outptr, bd);
   1451     input += 8;
   1452     outptr += 8;
   1453   }
   1454 
   1455   // Then transform columns.
   1456   for (i = 0; i < 8; ++i) {
   1457     for (j = 0; j < 8; ++j)
   1458       temp_in[j] = out[j * 8 + i];
   1459     vpx_highbd_idct8_c(temp_in, temp_out, bd);
   1460     for (j = 0; j < 8; ++j) {
   1461       dest[j * stride + i] = highbd_clip_pixel_add(
   1462           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
   1463     }
   1464   }
   1465 }
   1466 
   1467 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
   1468                                 int stride, int bd) {
   1469   int i, j;
   1470   tran_high_t a1;
   1471   tran_low_t out = WRAPLOW(
   1472       highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
   1473   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1474   out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
   1475   a1 = ROUND_POWER_OF_TWO(out, 5);
   1476   for (j = 0; j < 8; ++j) {
   1477     for (i = 0; i < 8; ++i)
   1478       dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
   1479     dest += stride;
   1480   }
   1481 }
   1482 
   1483 void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1484   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
   1485 
   1486   tran_low_t x0 = input[0];
   1487   tran_low_t x1 = input[1];
   1488   tran_low_t x2 = input[2];
   1489   tran_low_t x3 = input[3];
   1490   (void) bd;
   1491 
   1492   if (!(x0 | x1 | x2 | x3)) {
   1493     memset(output, 0, 4 * sizeof(*output));
   1494     return;
   1495   }
   1496 
   1497   s0 = sinpi_1_9 * x0;
   1498   s1 = sinpi_2_9 * x0;
   1499   s2 = sinpi_3_9 * x1;
   1500   s3 = sinpi_4_9 * x2;
   1501   s4 = sinpi_1_9 * x2;
   1502   s5 = sinpi_2_9 * x3;
   1503   s6 = sinpi_4_9 * x3;
   1504   s7 = (tran_high_t)(x0 - x2 + x3);
   1505 
   1506   s0 = s0 + s3 + s5;
   1507   s1 = s1 - s4 - s6;
   1508   s3 = s2;
   1509   s2 = sinpi_3_9 * s7;
   1510 
   1511   // 1-D transform scaling factor is sqrt(2).
   1512   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   1513   // + 1b (addition) = 29b.
   1514   // Hence the output bit depth is 15b.
   1515   output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
   1516   output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
   1517   output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
   1518   output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
   1519 }
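
/* Editorial note (numeric check): with the Q14 constants sinpi_1_9 == 5283,
 * sinpi_2_9 == 9929 and sinpi_4_9 == 15212, the identity
 * sin(pi/9) + sin(2*pi/9) == sin(4*pi/9) holds exactly for the quantized
 * values too (5283 + 9929 == 15212).  Identities of this kind are what let
 * the 4-point ADST above be factored so that the shared term
 * s7 = x0 - x2 + x3 needs only the single extra multiply by sinpi_3_9. */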
   1520 
   1521 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1522   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
   1523 
   1524   tran_low_t x0 = input[7];
   1525   tran_low_t x1 = input[0];
   1526   tran_low_t x2 = input[5];
   1527   tran_low_t x3 = input[2];
   1528   tran_low_t x4 = input[3];
   1529   tran_low_t x5 = input[4];
   1530   tran_low_t x6 = input[1];
   1531   tran_low_t x7 = input[6];
   1532   (void) bd;
   1533 
   1534   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
   1535     memset(output, 0, 8 * sizeof(*output));
   1536     return;
   1537   }
   1538 
   1539   // stage 1
   1540   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
   1541   s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
   1542   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
   1543   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
   1544   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
   1545   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
   1546   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   1547   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
   1548 
   1549   x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
   1550   x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
   1551   x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
   1552   x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
   1553   x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
   1554   x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
   1555   x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
   1556   x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
   1557 
   1558   // stage 2
   1559   s0 = x0;
   1560   s1 = x1;
   1561   s2 = x2;
   1562   s3 = x3;
   1563   s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
   1564   s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
   1565   s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
   1566   s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
   1567 
   1568   x0 = WRAPLOW(s0 + s2, bd);
   1569   x1 = WRAPLOW(s1 + s3, bd);
   1570   x2 = WRAPLOW(s0 - s2, bd);
   1571   x3 = WRAPLOW(s1 - s3, bd);
   1572   x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
   1573   x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
   1574   x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
   1575   x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
   1576 
   1577   // stage 3
   1578   s2 = cospi_16_64 * (x2 + x3);
   1579   s3 = cospi_16_64 * (x2 - x3);
   1580   s6 = cospi_16_64 * (x6 + x7);
   1581   s7 = cospi_16_64 * (x6 - x7);
   1582 
   1583   x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
   1584   x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
   1585   x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
   1586   x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
   1587 
   1588   output[0] = WRAPLOW(x0, bd);
   1589   output[1] = WRAPLOW(-x4, bd);
   1590   output[2] = WRAPLOW(x6, bd);
   1591   output[3] = WRAPLOW(-x2, bd);
   1592   output[4] = WRAPLOW(x3, bd);
   1593   output[5] = WRAPLOW(-x7, bd);
   1594   output[6] = WRAPLOW(x5, bd);
   1595   output[7] = WRAPLOW(-x1, bd);
   1596 }
   1597 
   1598 void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
   1599                                  int stride, int bd) {
   1600   tran_low_t out[8 * 8] = { 0 };
   1601   tran_low_t *outptr = out;
   1602   int i, j;
   1603   tran_low_t temp_in[8], temp_out[8];
   1604   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1605 
   1606   // First transform rows.
    1607   // Only the first 4 rows have non-zero coefficients.
   1608   for (i = 0; i < 4; ++i) {
   1609     vpx_highbd_idct8_c(input, outptr, bd);
   1610     input += 8;
   1611     outptr += 8;
   1612   }
   1613   // Then transform columns.
   1614   for (i = 0; i < 8; ++i) {
   1615     for (j = 0; j < 8; ++j)
   1616       temp_in[j] = out[j * 8 + i];
   1617     vpx_highbd_idct8_c(temp_in, temp_out, bd);
   1618     for (j = 0; j < 8; ++j) {
   1619       dest[j * stride + i] = highbd_clip_pixel_add(
   1620           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
   1621     }
   1622   }
   1623 }
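
/* Editorial note: the _10/_64 suffixes (and _10/_256, _34/_1024 below) give
 * the maximum number of non-zero coefficients each variant handles.  In the
 * scan orders VP9 uses, the first 10 coefficients of an 8x8 block all fall
 * inside the upper-left 4x4 corner, which is what justifies the 4-row
 * shortcut above. */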
   1624 
   1625 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1626   tran_low_t step1[16], step2[16];
   1627   tran_high_t temp1, temp2;
   1628   (void) bd;
   1629 
   1630   // stage 1
    1631   step1[0] = input[0];
    1632   step1[1] = input[8];
    1633   step1[2] = input[4];
    1634   step1[3] = input[12];
    1635   step1[4] = input[2];
    1636   step1[5] = input[10];
    1637   step1[6] = input[6];
    1638   step1[7] = input[14];
    1639   step1[8] = input[1];
    1640   step1[9] = input[9];
    1641   step1[10] = input[5];
    1642   step1[11] = input[13];
    1643   step1[12] = input[3];
    1644   step1[13] = input[11];
    1645   step1[14] = input[7];
    1646   step1[15] = input[15];
   1647 
   1648   // stage 2
   1649   step2[0] = step1[0];
   1650   step2[1] = step1[1];
   1651   step2[2] = step1[2];
   1652   step2[3] = step1[3];
   1653   step2[4] = step1[4];
   1654   step2[5] = step1[5];
   1655   step2[6] = step1[6];
   1656   step2[7] = step1[7];
   1657 
   1658   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   1659   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
   1660   step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1661   step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1662 
   1663   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   1664   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
   1665   step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1666   step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1667 
   1668   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   1669   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
   1670   step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1671   step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1672 
   1673   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   1674   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
   1675   step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1676   step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1677 
   1678   // stage 3
   1679   step1[0] = step2[0];
   1680   step1[1] = step2[1];
   1681   step1[2] = step2[2];
   1682   step1[3] = step2[3];
   1683 
   1684   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   1685   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
   1686   step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1687   step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1688   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   1689   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
   1690   step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1691   step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1692 
   1693   step1[8] = WRAPLOW(step2[8] + step2[9], bd);
   1694   step1[9] = WRAPLOW(step2[8] - step2[9], bd);
   1695   step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
   1696   step1[11] = WRAPLOW(step2[10] + step2[11], bd);
   1697   step1[12] = WRAPLOW(step2[12] + step2[13], bd);
   1698   step1[13] = WRAPLOW(step2[12] - step2[13], bd);
   1699   step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
   1700   step1[15] = WRAPLOW(step2[14] + step2[15], bd);
   1701 
   1702   // stage 4
   1703   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   1704   temp2 = (step1[0] - step1[1]) * cospi_16_64;
   1705   step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1706   step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1707   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   1708   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
   1709   step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1710   step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1711   step2[4] = WRAPLOW(step1[4] + step1[5], bd);
   1712   step2[5] = WRAPLOW(step1[4] - step1[5], bd);
   1713   step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
   1714   step2[7] = WRAPLOW(step1[6] + step1[7], bd);
   1715 
   1716   step2[8] = step1[8];
   1717   step2[15] = step1[15];
   1718   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   1719   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
   1720   step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1721   step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1722   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   1723   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
   1724   step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1725   step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1726   step2[11] = step1[11];
   1727   step2[12] = step1[12];
   1728 
   1729   // stage 5
   1730   step1[0] = WRAPLOW(step2[0] + step2[3], bd);
   1731   step1[1] = WRAPLOW(step2[1] + step2[2], bd);
   1732   step1[2] = WRAPLOW(step2[1] - step2[2], bd);
   1733   step1[3] = WRAPLOW(step2[0] - step2[3], bd);
   1734   step1[4] = step2[4];
   1735   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   1736   temp2 = (step2[5] + step2[6]) * cospi_16_64;
   1737   step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1738   step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1739   step1[7] = step2[7];
   1740 
   1741   step1[8] = WRAPLOW(step2[8] + step2[11], bd);
   1742   step1[9] = WRAPLOW(step2[9] + step2[10], bd);
   1743   step1[10] = WRAPLOW(step2[9] - step2[10], bd);
   1744   step1[11] = WRAPLOW(step2[8] - step2[11], bd);
   1745   step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
   1746   step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
   1747   step1[14] = WRAPLOW(step2[13] + step2[14], bd);
   1748   step1[15] = WRAPLOW(step2[12] + step2[15], bd);
   1749 
   1750   // stage 6
   1751   step2[0] = WRAPLOW(step1[0] + step1[7], bd);
   1752   step2[1] = WRAPLOW(step1[1] + step1[6], bd);
   1753   step2[2] = WRAPLOW(step1[2] + step1[5], bd);
   1754   step2[3] = WRAPLOW(step1[3] + step1[4], bd);
   1755   step2[4] = WRAPLOW(step1[3] - step1[4], bd);
   1756   step2[5] = WRAPLOW(step1[2] - step1[5], bd);
   1757   step2[6] = WRAPLOW(step1[1] - step1[6], bd);
   1758   step2[7] = WRAPLOW(step1[0] - step1[7], bd);
   1759   step2[8] = step1[8];
   1760   step2[9] = step1[9];
   1761   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   1762   temp2 = (step1[10] + step1[13]) * cospi_16_64;
   1763   step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1764   step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1765   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   1766   temp2 = (step1[11] + step1[12]) * cospi_16_64;
   1767   step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   1768   step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   1769   step2[14] = step1[14];
   1770   step2[15] = step1[15];
   1771 
   1772   // stage 7
   1773   output[0] = WRAPLOW(step2[0] + step2[15], bd);
   1774   output[1] = WRAPLOW(step2[1] + step2[14], bd);
   1775   output[2] = WRAPLOW(step2[2] + step2[13], bd);
   1776   output[3] = WRAPLOW(step2[3] + step2[12], bd);
   1777   output[4] = WRAPLOW(step2[4] + step2[11], bd);
   1778   output[5] = WRAPLOW(step2[5] + step2[10], bd);
   1779   output[6] = WRAPLOW(step2[6] + step2[9], bd);
   1780   output[7] = WRAPLOW(step2[7] + step2[8], bd);
   1781   output[8] = WRAPLOW(step2[7] - step2[8], bd);
   1782   output[9] = WRAPLOW(step2[6] - step2[9], bd);
   1783   output[10] = WRAPLOW(step2[5] - step2[10], bd);
   1784   output[11] = WRAPLOW(step2[4] - step2[11], bd);
   1785   output[12] = WRAPLOW(step2[3] - step2[12], bd);
   1786   output[13] = WRAPLOW(step2[2] - step2[13], bd);
   1787   output[14] = WRAPLOW(step2[1] - step2[14], bd);
   1788   output[15] = WRAPLOW(step2[0] - step2[15], bd);
   1789 }
   1790 
   1791 void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
   1792                                     int stride, int bd) {
   1793   tran_low_t out[16 * 16];
   1794   tran_low_t *outptr = out;
   1795   int i, j;
   1796   tran_low_t temp_in[16], temp_out[16];
   1797   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1798 
   1799   // First transform rows.
   1800   for (i = 0; i < 16; ++i) {
   1801     vpx_highbd_idct16_c(input, outptr, bd);
   1802     input += 16;
   1803     outptr += 16;
   1804   }
   1805 
   1806   // Then transform columns.
   1807   for (i = 0; i < 16; ++i) {
   1808     for (j = 0; j < 16; ++j)
   1809       temp_in[j] = out[j * 16 + i];
   1810     vpx_highbd_idct16_c(temp_in, temp_out, bd);
   1811     for (j = 0; j < 16; ++j) {
   1812       dest[j * stride + i] = highbd_clip_pixel_add(
   1813           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   1814     }
   1815   }
   1816 }
   1817 
   1818 void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1819   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
   1820   tran_high_t s9, s10, s11, s12, s13, s14, s15;
   1821 
   1822   tran_low_t x0 = input[15];
   1823   tran_low_t x1 = input[0];
   1824   tran_low_t x2 = input[13];
   1825   tran_low_t x3 = input[2];
   1826   tran_low_t x4 = input[11];
   1827   tran_low_t x5 = input[4];
   1828   tran_low_t x6 = input[9];
   1829   tran_low_t x7 = input[6];
   1830   tran_low_t x8 = input[7];
   1831   tran_low_t x9 = input[8];
   1832   tran_low_t x10 = input[5];
   1833   tran_low_t x11 = input[10];
   1834   tran_low_t x12 = input[3];
   1835   tran_low_t x13 = input[12];
   1836   tran_low_t x14 = input[1];
   1837   tran_low_t x15 = input[14];
   1838   (void) bd;
   1839 
   1840   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
   1841            | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
   1842     memset(output, 0, 16 * sizeof(*output));
   1843     return;
   1844   }
   1845 
   1846   // stage 1
   1847   s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
   1848   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
   1849   s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
   1850   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
   1851   s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
   1852   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
   1853   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
   1854   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
   1855   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
   1856   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
   1857   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
   1858   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
   1859   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
   1860   s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
   1861   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   1862   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
   1863 
   1864   x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
   1865   x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
   1866   x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
   1867   x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
   1868   x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
   1869   x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
   1870   x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
   1871   x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
   1872   x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
   1873   x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
   1874   x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
   1875   x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
   1876   x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
   1877   x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
   1878   x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
   1879   x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
   1880 
   1881   // stage 2
   1882   s0 = x0;
   1883   s1 = x1;
   1884   s2 = x2;
   1885   s3 = x3;
   1886   s4 = x4;
   1887   s5 = x5;
   1888   s6 = x6;
   1889   s7 = x7;
   1890   s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
   1891   s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
   1892   s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
   1893   s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
   1894   s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
   1895   s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
   1896   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
   1897   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
   1898 
   1899   x0 = WRAPLOW(s0 + s4, bd);
   1900   x1 = WRAPLOW(s1 + s5, bd);
   1901   x2 = WRAPLOW(s2 + s6, bd);
   1902   x3 = WRAPLOW(s3 + s7, bd);
   1903   x4 = WRAPLOW(s0 - s4, bd);
   1904   x5 = WRAPLOW(s1 - s5, bd);
   1905   x6 = WRAPLOW(s2 - s6, bd);
   1906   x7 = WRAPLOW(s3 - s7, bd);
   1907   x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
   1908   x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
   1909   x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
   1910   x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
   1911   x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
   1912   x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
   1913   x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
   1914   x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
   1915 
   1916   // stage 3
   1917   s0 = x0;
   1918   s1 = x1;
   1919   s2 = x2;
   1920   s3 = x3;
   1921   s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
   1922   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
   1923   s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
   1924   s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
   1925   s8 = x8;
   1926   s9 = x9;
   1927   s10 = x10;
   1928   s11 = x11;
   1929   s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
   1930   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
   1931   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
   1932   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
   1933 
   1934   x0 = WRAPLOW(s0 + s2, bd);
   1935   x1 = WRAPLOW(s1 + s3, bd);
   1936   x2 = WRAPLOW(s0 - s2, bd);
   1937   x3 = WRAPLOW(s1 - s3, bd);
   1938   x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
   1939   x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
   1940   x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
   1941   x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
   1942   x8 = WRAPLOW(s8 + s10, bd);
   1943   x9 = WRAPLOW(s9 + s11, bd);
   1944   x10 = WRAPLOW(s8 - s10, bd);
   1945   x11 = WRAPLOW(s9 - s11, bd);
   1946   x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
   1947   x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
   1948   x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
   1949   x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
   1950 
   1951   // stage 4
    1952   s2 = -cospi_16_64 * (x2 + x3);
   1953   s3 = cospi_16_64 * (x2 - x3);
   1954   s6 = cospi_16_64 * (x6 + x7);
   1955   s7 = cospi_16_64 * (-x6 + x7);
   1956   s10 = cospi_16_64 * (x10 + x11);
   1957   s11 = cospi_16_64 * (-x10 + x11);
    1958   s14 = -cospi_16_64 * (x14 + x15);
   1959   s15 = cospi_16_64 * (x14 - x15);
   1960 
   1961   x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
   1962   x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
   1963   x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
   1964   x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
   1965   x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
   1966   x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
   1967   x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
   1968   x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
   1969 
   1970   output[0] = WRAPLOW(x0, bd);
   1971   output[1] = WRAPLOW(-x8, bd);
   1972   output[2] = WRAPLOW(x12, bd);
   1973   output[3] = WRAPLOW(-x4, bd);
   1974   output[4] = WRAPLOW(x6, bd);
   1975   output[5] = WRAPLOW(x14, bd);
   1976   output[6] = WRAPLOW(x10, bd);
   1977   output[7] = WRAPLOW(x2, bd);
   1978   output[8] = WRAPLOW(x3, bd);
   1979   output[9] = WRAPLOW(x11, bd);
   1980   output[10] = WRAPLOW(x15, bd);
   1981   output[11] = WRAPLOW(x7, bd);
   1982   output[12] = WRAPLOW(x5, bd);
   1983   output[13] = WRAPLOW(-x13, bd);
   1984   output[14] = WRAPLOW(x9, bd);
   1985   output[15] = WRAPLOW(-x1, bd);
   1986 }
   1987 
   1988 void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
   1989                                    int stride, int bd) {
   1990   tran_low_t out[16 * 16] = { 0 };
   1991   tran_low_t *outptr = out;
   1992   int i, j;
   1993   tran_low_t temp_in[16], temp_out[16];
   1994   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   1995 
    1996   // First transform rows. Since all non-zero DCT coefficients are in the
    1997   // upper-left 4x4 area, only the first 4 rows need to be computed here.
   1998   for (i = 0; i < 4; ++i) {
   1999     vpx_highbd_idct16_c(input, outptr, bd);
   2000     input += 16;
   2001     outptr += 16;
   2002   }
   2003 
   2004   // Then transform columns.
   2005   for (i = 0; i < 16; ++i) {
   2006     for (j = 0; j < 16; ++j)
    2007       temp_in[j] = out[j * 16 + i];
   2008     vpx_highbd_idct16_c(temp_in, temp_out, bd);
   2009     for (j = 0; j < 16; ++j) {
   2010       dest[j * stride + i] = highbd_clip_pixel_add(
   2011           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2012     }
   2013   }
   2014 }
   2015 
   2016 void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
   2017                                   int stride, int bd) {
   2018   int i, j;
   2019   tran_high_t a1;
   2020   tran_low_t out = WRAPLOW(
   2021       highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
   2022   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   2023 
   2024   out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
   2025   a1 = ROUND_POWER_OF_TWO(out, 6);
   2026   for (j = 0; j < 16; ++j) {
   2027     for (i = 0; i < 16; ++i)
   2028       dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
   2029     dest += stride;
   2030   }
   2031 }
   2032 
   2033 static void highbd_idct32_c(const tran_low_t *input,
   2034                             tran_low_t *output, int bd) {
   2035   tran_low_t step1[32], step2[32];
   2036   tran_high_t temp1, temp2;
   2037   (void) bd;
   2038 
   2039   // stage 1
   2040   step1[0] = input[0];
   2041   step1[1] = input[16];
   2042   step1[2] = input[8];
   2043   step1[3] = input[24];
   2044   step1[4] = input[4];
   2045   step1[5] = input[20];
   2046   step1[6] = input[12];
   2047   step1[7] = input[28];
   2048   step1[8] = input[2];
   2049   step1[9] = input[18];
   2050   step1[10] = input[10];
   2051   step1[11] = input[26];
   2052   step1[12] = input[6];
   2053   step1[13] = input[22];
   2054   step1[14] = input[14];
   2055   step1[15] = input[30];
   2056 
   2057   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   2058   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
   2059   step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2060   step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2061 
   2062   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   2063   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
   2064   step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2065   step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2066 
   2067   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   2068   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
   2069   step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2070   step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2071 
   2072   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   2073   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
   2074   step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2075   step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2076 
   2077   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   2078   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
   2079   step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2080   step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2081 
   2082   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   2083   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
   2084   step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2085   step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2086 
   2087   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   2088   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
   2089   step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2090   step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2091 
   2092   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   2093   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
   2094   step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2095   step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2096 
   2097   // stage 2
   2098   step2[0] = step1[0];
   2099   step2[1] = step1[1];
   2100   step2[2] = step1[2];
   2101   step2[3] = step1[3];
   2102   step2[4] = step1[4];
   2103   step2[5] = step1[5];
   2104   step2[6] = step1[6];
   2105   step2[7] = step1[7];
   2106 
   2107   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   2108   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
   2109   step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2110   step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2111 
   2112   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   2113   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
   2114   step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2115   step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2116 
   2117   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   2118   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
   2119   step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2120   step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2121 
   2122   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   2123   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
   2124   step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2125   step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2126 
   2127   step2[16] = WRAPLOW(step1[16] + step1[17], bd);
   2128   step2[17] = WRAPLOW(step1[16] - step1[17], bd);
   2129   step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
   2130   step2[19] = WRAPLOW(step1[18] + step1[19], bd);
   2131   step2[20] = WRAPLOW(step1[20] + step1[21], bd);
   2132   step2[21] = WRAPLOW(step1[20] - step1[21], bd);
   2133   step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
   2134   step2[23] = WRAPLOW(step1[22] + step1[23], bd);
   2135   step2[24] = WRAPLOW(step1[24] + step1[25], bd);
   2136   step2[25] = WRAPLOW(step1[24] - step1[25], bd);
   2137   step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
   2138   step2[27] = WRAPLOW(step1[26] + step1[27], bd);
   2139   step2[28] = WRAPLOW(step1[28] + step1[29], bd);
   2140   step2[29] = WRAPLOW(step1[28] - step1[29], bd);
   2141   step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
   2142   step2[31] = WRAPLOW(step1[30] + step1[31], bd);
   2143 
   2144   // stage 3
   2145   step1[0] = step2[0];
   2146   step1[1] = step2[1];
   2147   step1[2] = step2[2];
   2148   step1[3] = step2[3];
   2149 
   2150   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   2151   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
   2152   step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2153   step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2154   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   2155   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
   2156   step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2157   step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2158 
   2159   step1[8] = WRAPLOW(step2[8] + step2[9], bd);
   2160   step1[9] = WRAPLOW(step2[8] - step2[9], bd);
   2161   step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
   2162   step1[11] = WRAPLOW(step2[10] + step2[11], bd);
   2163   step1[12] = WRAPLOW(step2[12] + step2[13], bd);
   2164   step1[13] = WRAPLOW(step2[12] - step2[13], bd);
   2165   step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
   2166   step1[15] = WRAPLOW(step2[14] + step2[15], bd);
   2167 
   2168   step1[16] = step2[16];
   2169   step1[31] = step2[31];
   2170   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   2171   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
   2172   step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2173   step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2174   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   2175   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
   2176   step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2177   step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2178   step1[19] = step2[19];
   2179   step1[20] = step2[20];
   2180   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   2181   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
   2182   step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2183   step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2184   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   2185   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
   2186   step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2187   step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2188   step1[23] = step2[23];
   2189   step1[24] = step2[24];
   2190   step1[27] = step2[27];
   2191   step1[28] = step2[28];
   2192 
   2193   // stage 4
   2194   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   2195   temp2 = (step1[0] - step1[1]) * cospi_16_64;
   2196   step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2197   step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2198   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   2199   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
   2200   step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2201   step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2202   step2[4] = WRAPLOW(step1[4] + step1[5], bd);
   2203   step2[5] = WRAPLOW(step1[4] - step1[5], bd);
   2204   step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
   2205   step2[7] = WRAPLOW(step1[6] + step1[7], bd);
   2206 
   2207   step2[8] = step1[8];
   2208   step2[15] = step1[15];
   2209   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   2210   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
   2211   step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2212   step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2213   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   2214   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
   2215   step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2216   step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2217   step2[11] = step1[11];
   2218   step2[12] = step1[12];
   2219 
   2220   step2[16] = WRAPLOW(step1[16] + step1[19], bd);
   2221   step2[17] = WRAPLOW(step1[17] + step1[18], bd);
   2222   step2[18] = WRAPLOW(step1[17] - step1[18], bd);
   2223   step2[19] = WRAPLOW(step1[16] - step1[19], bd);
   2224   step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
   2225   step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
   2226   step2[22] = WRAPLOW(step1[21] + step1[22], bd);
   2227   step2[23] = WRAPLOW(step1[20] + step1[23], bd);
   2228 
   2229   step2[24] = WRAPLOW(step1[24] + step1[27], bd);
   2230   step2[25] = WRAPLOW(step1[25] + step1[26], bd);
   2231   step2[26] = WRAPLOW(step1[25] - step1[26], bd);
   2232   step2[27] = WRAPLOW(step1[24] - step1[27], bd);
   2233   step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
   2234   step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
   2235   step2[30] = WRAPLOW(step1[29] + step1[30], bd);
   2236   step2[31] = WRAPLOW(step1[28] + step1[31], bd);
   2237 
   2238   // stage 5
   2239   step1[0] = WRAPLOW(step2[0] + step2[3], bd);
   2240   step1[1] = WRAPLOW(step2[1] + step2[2], bd);
   2241   step1[2] = WRAPLOW(step2[1] - step2[2], bd);
   2242   step1[3] = WRAPLOW(step2[0] - step2[3], bd);
   2243   step1[4] = step2[4];
   2244   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   2245   temp2 = (step2[5] + step2[6]) * cospi_16_64;
   2246   step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2247   step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2248   step1[7] = step2[7];
   2249 
   2250   step1[8] = WRAPLOW(step2[8] + step2[11], bd);
   2251   step1[9] = WRAPLOW(step2[9] + step2[10], bd);
   2252   step1[10] = WRAPLOW(step2[9] - step2[10], bd);
   2253   step1[11] = WRAPLOW(step2[8] - step2[11], bd);
   2254   step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
   2255   step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
   2256   step1[14] = WRAPLOW(step2[13] + step2[14], bd);
   2257   step1[15] = WRAPLOW(step2[12] + step2[15], bd);
   2258 
   2259   step1[16] = step2[16];
   2260   step1[17] = step2[17];
   2261   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   2262   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
   2263   step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2264   step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2265   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   2266   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
   2267   step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2268   step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2269   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   2270   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
   2271   step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2272   step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2273   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   2274   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
   2275   step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2276   step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2277   step1[22] = step2[22];
   2278   step1[23] = step2[23];
   2279   step1[24] = step2[24];
   2280   step1[25] = step2[25];
   2281   step1[30] = step2[30];
   2282   step1[31] = step2[31];
   2283 
   2284   // stage 6
   2285   step2[0] = WRAPLOW(step1[0] + step1[7], bd);
   2286   step2[1] = WRAPLOW(step1[1] + step1[6], bd);
   2287   step2[2] = WRAPLOW(step1[2] + step1[5], bd);
   2288   step2[3] = WRAPLOW(step1[3] + step1[4], bd);
   2289   step2[4] = WRAPLOW(step1[3] - step1[4], bd);
   2290   step2[5] = WRAPLOW(step1[2] - step1[5], bd);
   2291   step2[6] = WRAPLOW(step1[1] - step1[6], bd);
   2292   step2[7] = WRAPLOW(step1[0] - step1[7], bd);
   2293   step2[8] = step1[8];
   2294   step2[9] = step1[9];
   2295   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   2296   temp2 = (step1[10] + step1[13]) * cospi_16_64;
   2297   step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2298   step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2299   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   2300   temp2 = (step1[11] + step1[12]) * cospi_16_64;
   2301   step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2302   step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2303   step2[14] = step1[14];
   2304   step2[15] = step1[15];
   2305 
   2306   step2[16] = WRAPLOW(step1[16] + step1[23], bd);
   2307   step2[17] = WRAPLOW(step1[17] + step1[22], bd);
   2308   step2[18] = WRAPLOW(step1[18] + step1[21], bd);
   2309   step2[19] = WRAPLOW(step1[19] + step1[20], bd);
   2310   step2[20] = WRAPLOW(step1[19] - step1[20], bd);
   2311   step2[21] = WRAPLOW(step1[18] - step1[21], bd);
   2312   step2[22] = WRAPLOW(step1[17] - step1[22], bd);
   2313   step2[23] = WRAPLOW(step1[16] - step1[23], bd);
   2314 
   2315   step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
   2316   step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
   2317   step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
   2318   step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
   2319   step2[28] = WRAPLOW(step1[27] + step1[28], bd);
   2320   step2[29] = WRAPLOW(step1[26] + step1[29], bd);
   2321   step2[30] = WRAPLOW(step1[25] + step1[30], bd);
   2322   step2[31] = WRAPLOW(step1[24] + step1[31], bd);
   2323 
   2324   // stage 7
   2325   step1[0] = WRAPLOW(step2[0] + step2[15], bd);
   2326   step1[1] = WRAPLOW(step2[1] + step2[14], bd);
   2327   step1[2] = WRAPLOW(step2[2] + step2[13], bd);
   2328   step1[3] = WRAPLOW(step2[3] + step2[12], bd);
   2329   step1[4] = WRAPLOW(step2[4] + step2[11], bd);
   2330   step1[5] = WRAPLOW(step2[5] + step2[10], bd);
   2331   step1[6] = WRAPLOW(step2[6] + step2[9], bd);
   2332   step1[7] = WRAPLOW(step2[7] + step2[8], bd);
   2333   step1[8] = WRAPLOW(step2[7] - step2[8], bd);
   2334   step1[9] = WRAPLOW(step2[6] - step2[9], bd);
   2335   step1[10] = WRAPLOW(step2[5] - step2[10], bd);
   2336   step1[11] = WRAPLOW(step2[4] - step2[11], bd);
   2337   step1[12] = WRAPLOW(step2[3] - step2[12], bd);
   2338   step1[13] = WRAPLOW(step2[2] - step2[13], bd);
   2339   step1[14] = WRAPLOW(step2[1] - step2[14], bd);
   2340   step1[15] = WRAPLOW(step2[0] - step2[15], bd);
   2341 
   2342   step1[16] = step2[16];
   2343   step1[17] = step2[17];
   2344   step1[18] = step2[18];
   2345   step1[19] = step2[19];
   2346   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   2347   temp2 = (step2[20] + step2[27]) * cospi_16_64;
   2348   step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2349   step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2350   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   2351   temp2 = (step2[21] + step2[26]) * cospi_16_64;
   2352   step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2353   step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2354   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   2355   temp2 = (step2[22] + step2[25]) * cospi_16_64;
   2356   step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2357   step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2358   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   2359   temp2 = (step2[23] + step2[24]) * cospi_16_64;
   2360   step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
   2361   step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
   2362   step1[28] = step2[28];
   2363   step1[29] = step2[29];
   2364   step1[30] = step2[30];
   2365   step1[31] = step2[31];
   2366 
   2367   // final stage
   2368   output[0] = WRAPLOW(step1[0] + step1[31], bd);
   2369   output[1] = WRAPLOW(step1[1] + step1[30], bd);
   2370   output[2] = WRAPLOW(step1[2] + step1[29], bd);
   2371   output[3] = WRAPLOW(step1[3] + step1[28], bd);
   2372   output[4] = WRAPLOW(step1[4] + step1[27], bd);
   2373   output[5] = WRAPLOW(step1[5] + step1[26], bd);
   2374   output[6] = WRAPLOW(step1[6] + step1[25], bd);
   2375   output[7] = WRAPLOW(step1[7] + step1[24], bd);
   2376   output[8] = WRAPLOW(step1[8] + step1[23], bd);
   2377   output[9] = WRAPLOW(step1[9] + step1[22], bd);
   2378   output[10] = WRAPLOW(step1[10] + step1[21], bd);
   2379   output[11] = WRAPLOW(step1[11] + step1[20], bd);
   2380   output[12] = WRAPLOW(step1[12] + step1[19], bd);
   2381   output[13] = WRAPLOW(step1[13] + step1[18], bd);
   2382   output[14] = WRAPLOW(step1[14] + step1[17], bd);
   2383   output[15] = WRAPLOW(step1[15] + step1[16], bd);
   2384   output[16] = WRAPLOW(step1[15] - step1[16], bd);
   2385   output[17] = WRAPLOW(step1[14] - step1[17], bd);
   2386   output[18] = WRAPLOW(step1[13] - step1[18], bd);
   2387   output[19] = WRAPLOW(step1[12] - step1[19], bd);
   2388   output[20] = WRAPLOW(step1[11] - step1[20], bd);
   2389   output[21] = WRAPLOW(step1[10] - step1[21], bd);
   2390   output[22] = WRAPLOW(step1[9] - step1[22], bd);
   2391   output[23] = WRAPLOW(step1[8] - step1[23], bd);
   2392   output[24] = WRAPLOW(step1[7] - step1[24], bd);
   2393   output[25] = WRAPLOW(step1[6] - step1[25], bd);
   2394   output[26] = WRAPLOW(step1[5] - step1[26], bd);
   2395   output[27] = WRAPLOW(step1[4] - step1[27], bd);
   2396   output[28] = WRAPLOW(step1[3] - step1[28], bd);
   2397   output[29] = WRAPLOW(step1[2] - step1[29], bd);
   2398   output[30] = WRAPLOW(step1[1] - step1[30], bd);
   2399   output[31] = WRAPLOW(step1[0] - step1[31], bd);
   2400 }
   2401 
   2402 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
   2403                                      int stride, int bd) {
   2404   tran_low_t out[32 * 32];
   2405   tran_low_t *outptr = out;
   2406   int i, j;
   2407   tran_low_t temp_in[32], temp_out[32];
   2408   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   2409 
   2410   // Rows
   2411   for (i = 0; i < 32; ++i) {
   2412     tran_low_t zero_coeff[16];
   2413     for (j = 0; j < 16; ++j)
   2414       zero_coeff[j] = input[2 * j] | input[2 * j + 1];
   2415     for (j = 0; j < 8; ++j)
   2416       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
   2417     for (j = 0; j < 4; ++j)
   2418       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
   2419     for (j = 0; j < 2; ++j)
   2420       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
   2421 
   2422     if (zero_coeff[0] | zero_coeff[1])
   2423       highbd_idct32_c(input, outptr, bd);
   2424     else
   2425       memset(outptr, 0, sizeof(tran_low_t) * 32);
   2426     input += 32;
   2427     outptr += 32;
   2428   }
   2429 
   2430   // Columns
   2431   for (i = 0; i < 32; ++i) {
   2432     for (j = 0; j < 32; ++j)
   2433       temp_in[j] = out[j * 32 + i];
   2434     highbd_idct32_c(temp_in, temp_out, bd);
   2435     for (j = 0; j < 32; ++j) {
   2436       dest[j * stride + i] = highbd_clip_pixel_add(
   2437           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2438     }
   2439   }
   2440 }
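
/* Illustrative sketch (hypothetical helper, not part of the library): the
 * OR-reduction in the row loop above is a branch-light way of testing an
 * entire row of 32 coefficients for zero; this loop is the straightforward
 * equivalent: */
static int highbd_idct32_row_is_zero(const tran_low_t *row) {
  int j;
  for (j = 0; j < 32; ++j)
    if (row[j]) return 0;  /* found a non-zero coefficient */
  return 1;
}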
   2441 
   2442 void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
   2443                                    int stride, int bd) {
    2444   tran_low_t out[32 * 32] = { 0 };
   2445   tran_low_t *outptr = out;
   2446   int i, j;
   2447   tran_low_t temp_in[32], temp_out[32];
   2448   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   2449 
   2450   // Rows
    2451   // Only the upper-left 8x8 block has non-zero coefficients.
   2452   for (i = 0; i < 8; ++i) {
   2453     highbd_idct32_c(input, outptr, bd);
   2454     input += 32;
   2455     outptr += 32;
   2456   }
   2457   // Columns
   2458   for (i = 0; i < 32; ++i) {
   2459     for (j = 0; j < 32; ++j)
   2460       temp_in[j] = out[j * 32 + i];
   2461     highbd_idct32_c(temp_in, temp_out, bd);
   2462     for (j = 0; j < 32; ++j) {
   2463       dest[j * stride + i] = highbd_clip_pixel_add(
   2464           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2465     }
   2466   }
   2467 }
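
/* Editorial note: as with the smaller sizes, the 34 in the name bounds the
 * non-zero coefficient count; in scan order, 34 coefficients of a 32x32
 * block stay within the upper-left 8x8 corner, hence the 8-row shortcut. */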
   2468 
   2469 void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
   2470                                   int stride, int bd) {
   2471   int i, j;
    2472   tran_high_t a1;
   2473   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   2474 
   2475   tran_low_t out = WRAPLOW(
   2476       highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
   2477   out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
   2478   a1 = ROUND_POWER_OF_TWO(out, 6);
   2479 
   2480   for (j = 0; j < 32; ++j) {
   2481     for (i = 0; i < 32; ++i)
   2482       dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
   2483     dest += stride;
   2484   }
   2485 }
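
/* Illustrative dispatch sketch (hypothetical helper, not part of the
 * library): callers normally pick the cheapest 32x32 variant from the
 * end-of-block position, along these lines: */
static void highbd_idct32x32_add_demo(const tran_low_t *input, uint8_t *dest8,
                                      int stride, int eob, int bd) {
  if (eob == 1)  /* DC coefficient only */
    vpx_highbd_idct32x32_1_add_c(input, dest8, stride, bd);
  else if (eob <= 34)  /* non-zero coefficients confined to the 8x8 corner */
    vpx_highbd_idct32x32_34_add_c(input, dest8, stride, bd);
  else
    vpx_highbd_idct32x32_1024_add_c(input, dest8, stride, bd);
}
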
   2486 #endif  // CONFIG_VP9_HIGHBITDEPTH
   2487