/* Home | History | Annotate | Download | only in vpx_dsp */
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <math.h>
     12 #include <stdlib.h>
     13 #include <string.h>
     14 
     15 #include "./vpx_dsp_rtcd.h"
     16 #include "vpx_dsp/inv_txfm.h"
     17 
// Inverse 4x4 Walsh-Hadamard transform (full 16-coefficient case), used for
// lossless mode. Applies a 1-D inverse WHT to each row, then to each column,
// and adds the clipped result into the destination block.
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];  // intermediate buffer holding the row-pass result
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Row pass. Coefficients were pre-scaled by the encoder, so each load
  // undoes that scaling with >> UNIT_QUANT_SHIFT. Note the permuted load
  // order (0, 1, 2, 3 -> a1, c1, d1, b1) required by the lifting scheme.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    // Reversible lifting steps: the single ">> 1" gives the 0.5 shifts /
    // 3.5 adds per pixel cost quoted above.
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  // Column pass: same lifting steps applied down each column of the
  // row-pass output, with the result added (and clipped) into dest.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));

    ip++;
    dest++;
  }
}
     69 
     70 void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
     71   int i;
     72   tran_high_t a1, e1;
     73   tran_low_t tmp[4];
     74   const tran_low_t *ip = in;
     75   tran_low_t *op = tmp;
     76 
     77   a1 = ip[0] >> UNIT_QUANT_SHIFT;
     78   e1 = a1 >> 1;
     79   a1 -= e1;
     80   op[0] = WRAPLOW(a1);
     81   op[1] = op[2] = op[3] = WRAPLOW(e1);
     82 
     83   ip = tmp;
     84   for (i = 0; i < 4; i++) {
     85     e1 = ip[0] >> 1;
     86     a1 = ip[0] - e1;
     87     dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
     88     dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
     89     dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
     90     dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
     91     ip++;
     92     dest++;
     93   }
     94 }
     95 
// 4-point 1-D inverse ADST (asymmetric discrete sine transform).
// Reads 4 coefficients from |input| and writes 4 samples to |output|.
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  // Fast path: an all-zero input transforms to an all-zero output.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // Products with the sinpi_*_9 fixed-point constants.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = WRAPLOW(x0 - x2 + x3);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
  output[2] = WRAPLOW(dct_const_round_shift(s2));
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
}
    131 
// 4-point 1-D inverse DCT in butterfly form. Reads 4 coefficients from
// |input| and writes 4 samples to |output|.
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;

  // stage 1: even part (inputs 0, 2) and odd part (inputs 1, 3) rotations
  // using the cospi_*_64 fixed-point constants.
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
  step[3] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: final butterfly combining even and odd halves.
  output[0] = WRAPLOW(step[0] + step[3]);
  output[1] = WRAPLOW(step[1] + step[2]);
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
}
    152 
    153 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    154   int i, j;
    155   tran_low_t out[4 * 4];
    156   tran_low_t *outptr = out;
    157   tran_low_t temp_in[4], temp_out[4];
    158 
    159   // Rows
    160   for (i = 0; i < 4; ++i) {
    161     idct4_c(input, outptr);
    162     input += 4;
    163     outptr += 4;
    164   }
    165 
    166   // Columns
    167   for (i = 0; i < 4; ++i) {
    168     for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
    169     idct4_c(temp_in, temp_out);
    170     for (j = 0; j < 4; ++j) {
    171       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    172                                             ROUND_POWER_OF_TWO(temp_out[j], 4));
    173     }
    174   }
    175 }
    176 
    177 void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    178   int i;
    179   tran_high_t a1;
    180   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
    181 
    182   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
    183   a1 = ROUND_POWER_OF_TWO(out, 4);
    184 
    185   for (i = 0; i < 4; i++) {
    186     dest[0] = clip_pixel_add(dest[0], a1);
    187     dest[1] = clip_pixel_add(dest[1], a1);
    188     dest[2] = clip_pixel_add(dest[2], a1);
    189     dest[3] = clip_pixel_add(dest[3], a1);
    190     dest += stride;
    191   }
    192 }
    193 
// 8-point 1-D inverse ADST. Reads 8 coefficients from |input| (note the
// permuted load order below) and writes 8 samples to |output| with
// alternating sign flips, per the ADST-8 flow graph.
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  // Intermediates are deliberately truncated to int at each stage; the
  // WRAPLOW results that feed them fit in that width.
  int s0, s1, s2, s3, s4, s5, s6, s7;
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // Fast path: an all-zero input transforms to an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Output with the sign flips required by the inverse ADST ordering.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
    268 
// 8-point 1-D inverse DCT in butterfly form. Reads 8 coefficients from
// |input| and writes 8 samples to |output|.
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // stage 1: load the even coefficients in bit-reversed order and rotate
  // the odd coefficients (1, 7) and (5, 3).
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: this is effectively an embedded idct4 on the even half, plus
  // butterflies on the odd half.
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4: final butterfly combining even and odd halves.
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
    323 
    324 void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    325   int i, j;
    326   tran_low_t out[8 * 8];
    327   tran_low_t *outptr = out;
    328   tran_low_t temp_in[8], temp_out[8];
    329 
    330   // First transform rows
    331   for (i = 0; i < 8; ++i) {
    332     idct8_c(input, outptr);
    333     input += 8;
    334     outptr += 8;
    335   }
    336 
    337   // Then transform columns
    338   for (i = 0; i < 8; ++i) {
    339     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    340     idct8_c(temp_in, temp_out);
    341     for (j = 0; j < 8; ++j) {
    342       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    343                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
    344     }
    345   }
    346 }
    347 
    348 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    349   int i, j;
    350   tran_low_t out[8 * 8] = { 0 };
    351   tran_low_t *outptr = out;
    352   tran_low_t temp_in[8], temp_out[8];
    353 
    354   // First transform rows
    355   // Only first 4 row has non-zero coefs
    356   for (i = 0; i < 4; ++i) {
    357     idct8_c(input, outptr);
    358     input += 8;
    359     outptr += 8;
    360   }
    361 
    362   // Then transform columns
    363   for (i = 0; i < 8; ++i) {
    364     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    365     idct8_c(temp_in, temp_out);
    366     for (j = 0; j < 8; ++j) {
    367       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    368                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
    369     }
    370   }
    371 }
    372 
    373 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    374   int i, j;
    375   tran_high_t a1;
    376   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
    377 
    378   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
    379   a1 = ROUND_POWER_OF_TWO(out, 5);
    380   for (j = 0; j < 8; ++j) {
    381     for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    382     dest += stride;
    383   }
    384 }
    385 
// 16-point 1-D inverse ADST. Reads 16 coefficients from |input| (note the
// permuted load order below) and writes 16 samples to |output| with the
// sign flips required by the ADST-16 flow graph.
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // Fast path: an all-zero input transforms to an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: rotations by the odd cospi constants.
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2: upper half passes through; lower half rotates by cospi_4/28
  // and cospi_20/12.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3: rotations by cospi_8/24 on the middle lanes.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4: final +/-45-degree rotations by cospi_16_64.
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Output with the sign flips required by the inverse ADST ordering.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
    553 
// 16-point 1-D inverse DCT in butterfly form. Reads 16 coefficients from
// |input| and writes 16 samples to |output|.
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1: load in bit-reversed order. The "/ 2" indices mirror the
  // idct32 layout (which uses the same even/odd split at twice the stride).
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2: even half passes through; odd half (8..15) rotates in pairs.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3: rotations on lanes 4..7; butterflies on lanes 8..15.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4: this is effectively the embedded idct8 even part on lanes
  // 0..7, plus further rotations on lanes 9/14 and 10/13.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly combining even and odd halves.
  output[0] = WRAPLOW(step2[0] + step2[15]);
  output[1] = WRAPLOW(step2[1] + step2[14]);
  output[2] = WRAPLOW(step2[2] + step2[13]);
  output[3] = WRAPLOW(step2[3] + step2[12]);
  output[4] = WRAPLOW(step2[4] + step2[11]);
  output[5] = WRAPLOW(step2[5] + step2[10]);
  output[6] = WRAPLOW(step2[6] + step2[9]);
  output[7] = WRAPLOW(step2[7] + step2[8]);
  output[8] = WRAPLOW(step2[7] - step2[8]);
  output[9] = WRAPLOW(step2[6] - step2[9]);
  output[10] = WRAPLOW(step2[5] - step2[10]);
  output[11] = WRAPLOW(step2[4] - step2[11]);
  output[12] = WRAPLOW(step2[3] - step2[12]);
  output[13] = WRAPLOW(step2[2] - step2[13]);
  output[14] = WRAPLOW(step2[1] - step2[14]);
  output[15] = WRAPLOW(step2[0] - step2[15]);
}
    718 
    719 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
    720                              int stride) {
    721   int i, j;
    722   tran_low_t out[16 * 16];
    723   tran_low_t *outptr = out;
    724   tran_low_t temp_in[16], temp_out[16];
    725 
    726   // First transform rows
    727   for (i = 0; i < 16; ++i) {
    728     idct16_c(input, outptr);
    729     input += 16;
    730     outptr += 16;
    731   }
    732 
    733   // Then transform columns
    734   for (i = 0; i < 16; ++i) {
    735     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    736     idct16_c(temp_in, temp_out);
    737     for (j = 0; j < 16; ++j) {
    738       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    739                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
    740     }
    741   }
    742 }
    743 
    744 void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
    745                             int stride) {
    746   int i, j;
    747   tran_low_t out[16 * 16] = { 0 };
    748   tran_low_t *outptr = out;
    749   tran_low_t temp_in[16], temp_out[16];
    750 
    751   // First transform rows. Since all non-zero dct coefficients are in
    752   // upper-left 8x8 area, we only need to calculate first 8 rows here.
    753   for (i = 0; i < 8; ++i) {
    754     idct16_c(input, outptr);
    755     input += 16;
    756     outptr += 16;
    757   }
    758 
    759   // Then transform columns
    760   for (i = 0; i < 16; ++i) {
    761     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    762     idct16_c(temp_in, temp_out);
    763     for (j = 0; j < 16; ++j) {
    764       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    765                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
    766     }
    767   }
    768 }
    769 
    770 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
    771                             int stride) {
    772   int i, j;
    773   tran_low_t out[16 * 16] = { 0 };
    774   tran_low_t *outptr = out;
    775   tran_low_t temp_in[16], temp_out[16];
    776 
    777   // First transform rows. Since all non-zero dct coefficients are in
    778   // upper-left 4x4 area, we only need to calculate first 4 rows here.
    779   for (i = 0; i < 4; ++i) {
    780     idct16_c(input, outptr);
    781     input += 16;
    782     outptr += 16;
    783   }
    784 
    785   // Then transform columns
    786   for (i = 0; i < 16; ++i) {
    787     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    788     idct16_c(temp_in, temp_out);
    789     for (j = 0; j < 16; ++j) {
    790       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    791                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
    792     }
    793   }
    794 }
    795 
    796 void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    797   int i, j;
    798   tran_high_t a1;
    799   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
    800 
    801   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
    802   a1 = ROUND_POWER_OF_TWO(out, 6);
    803   for (j = 0; j < 16; ++j) {
    804     for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    805     dest += stride;
    806   }
    807 }
    808 
// 32-point 1-D inverse DCT, C reference implementation.
// |input| holds 32 coefficients in natural frequency order; |output| receives
// the 32 inverse-transformed values. The transform is an 8-stage butterfly
// network: stage 1 reorders the even coefficients and rotates the odd ones
// into the 16..31 half, and successive stages alternate rotations
// (dct_const_round_shift after each cospi multiply) with add/sub butterflies
// (each result wrapped by WRAPLOW). The statement order and constant pairing
// are bit-exact contracts that the SIMD implementations are tested against.
void idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1
  // Even-indexed inputs are passed through in bit-reversed-style order so the
  // lower 16 lanes form the input of an embedded 16-point transform.
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  // Odd-indexed inputs enter through paired rotations by odd multiples of
  // pi/64 into lanes 16..31.
  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  // Lanes 0..7 pass through; 8..15 are rotated; 16..31 take add/sub
  // butterflies on adjacent pairs.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  step2[16] = WRAPLOW(step1[16] + step1[17]);
  step2[17] = WRAPLOW(step1[16] - step1[17]);
  step2[18] = WRAPLOW(-step1[18] + step1[19]);
  step2[19] = WRAPLOW(step1[18] + step1[19]);
  step2[20] = WRAPLOW(step1[20] + step1[21]);
  step2[21] = WRAPLOW(step1[20] - step1[21]);
  step2[22] = WRAPLOW(-step1[22] + step1[23]);
  step2[23] = WRAPLOW(step1[22] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[25]);
  step2[25] = WRAPLOW(step1[24] - step1[25]);
  step2[26] = WRAPLOW(-step1[26] + step1[27]);
  step2[27] = WRAPLOW(step1[26] + step1[27]);
  step2[28] = WRAPLOW(step1[28] + step1[29]);
  step2[29] = WRAPLOW(step1[28] - step1[29]);
  step2[30] = WRAPLOW(-step1[30] + step1[31]);
  step2[31] = WRAPLOW(step1[30] + step1[31]);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19]);
  step2[17] = WRAPLOW(step1[17] + step1[18]);
  step2[18] = WRAPLOW(step1[17] - step1[18]);
  step2[19] = WRAPLOW(step1[16] - step1[19]);
  step2[20] = WRAPLOW(-step1[20] + step1[23]);
  step2[21] = WRAPLOW(-step1[21] + step1[22]);
  step2[22] = WRAPLOW(step1[21] + step1[22]);
  step2[23] = WRAPLOW(step1[20] + step1[23]);

  step2[24] = WRAPLOW(step1[24] + step1[27]);
  step2[25] = WRAPLOW(step1[25] + step1[26]);
  step2[26] = WRAPLOW(step1[25] - step1[26]);
  step2[27] = WRAPLOW(step1[24] - step1[27]);
  step2[28] = WRAPLOW(-step1[28] + step1[31]);
  step2[29] = WRAPLOW(-step1[29] + step1[30]);
  step2[30] = WRAPLOW(step1[29] + step1[30]);
  step2[31] = WRAPLOW(step1[28] + step1[31]);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23]);
  step2[17] = WRAPLOW(step1[17] + step1[22]);
  step2[18] = WRAPLOW(step1[18] + step1[21]);
  step2[19] = WRAPLOW(step1[19] + step1[20]);
  step2[20] = WRAPLOW(step1[19] - step1[20]);
  step2[21] = WRAPLOW(step1[18] - step1[21]);
  step2[22] = WRAPLOW(step1[17] - step1[22]);
  step2[23] = WRAPLOW(step1[16] - step1[23]);

  step2[24] = WRAPLOW(-step1[24] + step1[31]);
  step2[25] = WRAPLOW(-step1[25] + step1[30]);
  step2[26] = WRAPLOW(-step1[26] + step1[29]);
  step2[27] = WRAPLOW(-step1[27] + step1[28]);
  step2[28] = WRAPLOW(step1[27] + step1[28]);
  step2[29] = WRAPLOW(step1[26] + step1[29]);
  step2[30] = WRAPLOW(step1[25] + step1[30]);
  step2[31] = WRAPLOW(step1[24] + step1[31]);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15]);
  step1[1] = WRAPLOW(step2[1] + step2[14]);
  step1[2] = WRAPLOW(step2[2] + step2[13]);
  step1[3] = WRAPLOW(step2[3] + step2[12]);
  step1[4] = WRAPLOW(step2[4] + step2[11]);
  step1[5] = WRAPLOW(step2[5] + step2[10]);
  step1[6] = WRAPLOW(step2[6] + step2[9]);
  step1[7] = WRAPLOW(step2[7] + step2[8]);
  step1[8] = WRAPLOW(step2[7] - step2[8]);
  step1[9] = WRAPLOW(step2[6] - step2[9]);
  step1[10] = WRAPLOW(step2[5] - step2[10]);
  step1[11] = WRAPLOW(step2[4] - step2[11]);
  step1[12] = WRAPLOW(step2[3] - step2[12]);
  step1[13] = WRAPLOW(step2[2] - step2[13]);
  step1[14] = WRAPLOW(step2[1] - step2[14]);
  step1[15] = WRAPLOW(step2[0] - step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  // Mirror-image butterflies: output[k] / output[31-k] are the sum and
  // difference of the same step1 pair.
  output[0] = WRAPLOW(step1[0] + step1[31]);
  output[1] = WRAPLOW(step1[1] + step1[30]);
  output[2] = WRAPLOW(step1[2] + step1[29]);
  output[3] = WRAPLOW(step1[3] + step1[28]);
  output[4] = WRAPLOW(step1[4] + step1[27]);
  output[5] = WRAPLOW(step1[5] + step1[26]);
  output[6] = WRAPLOW(step1[6] + step1[25]);
  output[7] = WRAPLOW(step1[7] + step1[24]);
  output[8] = WRAPLOW(step1[8] + step1[23]);
  output[9] = WRAPLOW(step1[9] + step1[22]);
  output[10] = WRAPLOW(step1[10] + step1[21]);
  output[11] = WRAPLOW(step1[11] + step1[20]);
  output[12] = WRAPLOW(step1[12] + step1[19]);
  output[13] = WRAPLOW(step1[13] + step1[18]);
  output[14] = WRAPLOW(step1[14] + step1[17]);
  output[15] = WRAPLOW(step1[15] + step1[16]);
  output[16] = WRAPLOW(step1[15] - step1[16]);
  output[17] = WRAPLOW(step1[14] - step1[17]);
  output[18] = WRAPLOW(step1[13] - step1[18]);
  output[19] = WRAPLOW(step1[12] - step1[19]);
  output[20] = WRAPLOW(step1[11] - step1[20]);
  output[21] = WRAPLOW(step1[10] - step1[21]);
  output[22] = WRAPLOW(step1[9] - step1[22]);
  output[23] = WRAPLOW(step1[8] - step1[23]);
  output[24] = WRAPLOW(step1[7] - step1[24]);
  output[25] = WRAPLOW(step1[6] - step1[25]);
  output[26] = WRAPLOW(step1[5] - step1[26]);
  output[27] = WRAPLOW(step1[4] - step1[27]);
  output[28] = WRAPLOW(step1[3] - step1[28]);
  output[29] = WRAPLOW(step1[2] - step1[29]);
  output[30] = WRAPLOW(step1[1] - step1[30]);
  output[31] = WRAPLOW(step1[0] - step1[31]);
}
   1175 
   1176 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
   1177                               int stride) {
   1178   int i, j;
   1179   tran_low_t out[32 * 32];
   1180   tran_low_t *outptr = out;
   1181   tran_low_t temp_in[32], temp_out[32];
   1182 
   1183   // Rows
   1184   for (i = 0; i < 32; ++i) {
   1185     int16_t zero_coeff = 0;
   1186     for (j = 0; j < 32; ++j) zero_coeff |= input[j];
   1187 
   1188     if (zero_coeff)
   1189       idct32_c(input, outptr);
   1190     else
   1191       memset(outptr, 0, sizeof(tran_low_t) * 32);
   1192     input += 32;
   1193     outptr += 32;
   1194   }
   1195 
   1196   // Columns
   1197   for (i = 0; i < 32; ++i) {
   1198     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
   1199     idct32_c(temp_in, temp_out);
   1200     for (j = 0; j < 32; ++j) {
   1201       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
   1202                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
   1203     }
   1204   }
   1205 }
   1206 
   1207 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
   1208                              int stride) {
   1209   int i, j;
   1210   tran_low_t out[32 * 32] = { 0 };
   1211   tran_low_t *outptr = out;
   1212   tran_low_t temp_in[32], temp_out[32];
   1213 
   1214   // Rows
   1215   // Only upper-left 16x16 has non-zero coeff
   1216   for (i = 0; i < 16; ++i) {
   1217     idct32_c(input, outptr);
   1218     input += 32;
   1219     outptr += 32;
   1220   }
   1221 
   1222   // Columns
   1223   for (i = 0; i < 32; ++i) {
   1224     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
   1225     idct32_c(temp_in, temp_out);
   1226     for (j = 0; j < 32; ++j) {
   1227       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
   1228                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
   1229     }
   1230   }
   1231 }
   1232 
   1233 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
   1234                             int stride) {
   1235   int i, j;
   1236   tran_low_t out[32 * 32] = { 0 };
   1237   tran_low_t *outptr = out;
   1238   tran_low_t temp_in[32], temp_out[32];
   1239 
   1240   // Rows
   1241   // Only upper-left 8x8 has non-zero coeff
   1242   for (i = 0; i < 8; ++i) {
   1243     idct32_c(input, outptr);
   1244     input += 32;
   1245     outptr += 32;
   1246   }
   1247 
   1248   // Columns
   1249   for (i = 0; i < 32; ++i) {
   1250     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
   1251     idct32_c(temp_in, temp_out);
   1252     for (j = 0; j < 32; ++j) {
   1253       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
   1254                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
   1255     }
   1256   }
   1257 }
   1258 
   1259 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   1260   int i, j;
   1261   tran_high_t a1;
   1262   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
   1263 
   1264   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   1265   a1 = ROUND_POWER_OF_TWO(out, 6);
   1266 
   1267   for (j = 0; j < 32; ++j) {
   1268     for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
   1269     dest += stride;
   1270   }
   1271 }
   1272 
   1273 #if CONFIG_VP9_HIGHBITDEPTH
   1274 
   1275 // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
   1276 // transform amplify bits + 1 bit for contingency in rounding and quantizing
   1277 #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
   1278 
   1279 static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
   1280                                               int size) {
   1281   int i;
   1282   for (i = 0; i < size; ++i)
   1283     if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
   1284   return 0;
   1285 }
   1286 
// High-bitdepth 4x4 inverse Walsh-Hadamard transform with clipped add into
// |dest| (a uint16_t pixel buffer of stride |stride|, bit depth |bd|).
// The row pass writes to a scratch buffer; the column pass adds directly
// into the destination. The add/sub ordering below is the exact inverse of
// the forward WHT and must not be reordered.
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Row pass. Coefficients are pre-scaled by the forward transform; undo
  // that with the UNIT_QUANT_SHIFT right-shift before the butterflies.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = HIGHBD_WRAPLOW(a1, bd);
    op[1] = HIGHBD_WRAPLOW(b1, bd);
    op[2] = HIGHBD_WRAPLOW(c1, bd);
    op[3] = HIGHBD_WRAPLOW(d1, bd);
    ip += 4;
    op += 4;
  }

  // Column pass: same butterfly applied down each column, with the result
  // clipped to [0, 2^bd - 1] and accumulated into the destination pixels.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] =
        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
    dest[stride * 1] =
        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
    dest[stride * 2] =
        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
    dest[stride * 3] =
        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);

    ip++;
    dest++;
  }
}
   1343 
// High-bitdepth DC-only 4x4 inverse Walsh-Hadamard: only in[0] is non-zero,
// so each row/column butterfly degenerates to one subtract-half step. The
// result is clipped and accumulated into |dest|.
void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,
                                int stride, int bd) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
  // bd is only consumed inside HIGHBD_WRAPLOW / highbd_clip_pixel_add; in
  // configurations where those macros ignore it, this silences the
  // unused-parameter warning.
  (void)bd;

  // Row pass on the single non-zero row: a1 goes to column 0, e1 to the
  // other three.
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = HIGHBD_WRAPLOW(a1, bd);
  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);

  // Column pass: same split per column, added into the destination with
  // clipping to the valid pixel range for |bd|.
  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
    ip++;
    dest++;
  }
}
   1371 
// High-bitdepth 4-point inverse ADST. |input| holds 4 coefficients;
// |output| receives 4 transformed values. Inputs outside the valid
// magnitude range, or an all-zero input, produce an all-zero output.
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  // bd is only consumed inside HIGHBD_WRAPLOW; silence unused-parameter
  // warnings in configurations where the macro ignores it.
  (void)bd;

  // Out-of-range coefficients (e.g. from a corrupt bitstream): assert in
  // range-checking builds, otherwise zero the output and bail.
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // All-zero shortcut.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // Products with the sinpi constants; s7 is the wrapped linear combination
  // used for the third output.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}
   1416 
   1417 void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1418   tran_low_t step[4];
   1419   tran_high_t temp1, temp2;
   1420   (void)bd;
   1421 
   1422   if (detect_invalid_highbd_input(input, 4)) {
   1423 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   1424     assert(0 && "invalid highbd txfm input");
   1425 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   1426     memset(output, 0, sizeof(*output) * 4);
   1427     return;
   1428   }
   1429 
   1430   // stage 1
   1431   temp1 = (input[0] + input[2]) * cospi_16_64;
   1432   temp2 = (input[0] - input[2]) * cospi_16_64;
   1433   step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1434   step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1435   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   1436   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
   1437   step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1438   step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1439 
   1440   // stage 2
   1441   output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
   1442   output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
   1443   output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
   1444   output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
   1445 }
   1446 
   1447 void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
   1448                                  int stride, int bd) {
   1449   int i, j;
   1450   tran_low_t out[4 * 4];
   1451   tran_low_t *outptr = out;
   1452   tran_low_t temp_in[4], temp_out[4];
   1453 
   1454   // Rows
   1455   for (i = 0; i < 4; ++i) {
   1456     vpx_highbd_idct4_c(input, outptr, bd);
   1457     input += 4;
   1458     outptr += 4;
   1459   }
   1460 
   1461   // Columns
   1462   for (i = 0; i < 4; ++i) {
   1463     for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
   1464     vpx_highbd_idct4_c(temp_in, temp_out, bd);
   1465     for (j = 0; j < 4; ++j) {
   1466       dest[j * stride + i] = highbd_clip_pixel_add(
   1467           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
   1468     }
   1469   }
   1470 }
   1471 
   1472 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
   1473                                 int stride, int bd) {
   1474   int i;
   1475   tran_high_t a1;
   1476   tran_low_t out =
   1477       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
   1478 
   1479   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   1480   a1 = ROUND_POWER_OF_TWO(out, 4);
   1481 
   1482   for (i = 0; i < 4; i++) {
   1483     dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
   1484     dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
   1485     dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
   1486     dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
   1487     dest += stride;
   1488   }
   1489 }
   1490 
// High-bitdepth 8-point inverse ADST. Inputs are read in the permuted order
// the ADST definition requires, run through three butterfly/rotation stages,
// and written out with the alternating sign pattern below. Out-of-range or
// all-zero inputs zero the output.
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  // Input permutation mandated by the ADST basis ordering.
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  // bd is only consumed inside HIGHBD_WRAPLOW; silence unused-parameter
  // warnings in configurations where the macro ignores it.
  (void)bd;

  // Out-of-range coefficients: assert in range-checking builds, otherwise
  // zero the output and bail.
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // All-zero shortcut.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1: four rotations by odd multiples of pi/32.
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);

  // stage 2: pass-through for the first four lanes, +/-pi/8 rotations for
  // the last four.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);

  // stage 3: pi/4 rotations on lanes 2/3 and 6/7.
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);

  // Output permutation with alternating negation.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x4, bd);
  output[2] = HIGHBD_WRAPLOW(x6, bd);
  output[3] = HIGHBD_WRAPLOW(-x2, bd);
  output[4] = HIGHBD_WRAPLOW(x3, bd);
  output[5] = HIGHBD_WRAPLOW(-x7, bd);
  output[6] = HIGHBD_WRAPLOW(x5, bd);
  output[7] = HIGHBD_WRAPLOW(-x1, bd);
}
   1574 
// 8-point inverse DCT for high bit-depth coefficients. Reads 8 coefficients
// from |input| and writes 8 reconstructed values to |output|. |bd| is the bit
// depth and is consumed by HIGHBD_WRAPLOW for range clamping. On invalid
// input the output is zeroed (and asserts under
// CONFIG_COEFFICIENT_RANGE_CHECKING).
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // stage 1
  // Even coefficients (0, 4, 2, 6) are gathered into step1[0..3] so the
  // even half can be handled by the shared 4-point inverse DCT below.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  // Odd half: rotate (input[1], input[7]) by cospi_4/28 ...
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  // ... and (input[5], input[3]) by cospi_20/12.
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2 & stage 3 - even half
  // In-place 4-point inverse DCT on step1[0..3].
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half: butterflies on step1[4..7].
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half: middle pair rotated by cospi_16 (scale by 1/sqrt(2)).
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  // stage 4: final butterfly combining even and odd halves.
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}
   1628 
   1629 void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
   1630                                  int stride, int bd) {
   1631   int i, j;
   1632   tran_low_t out[8 * 8];
   1633   tran_low_t *outptr = out;
   1634   tran_low_t temp_in[8], temp_out[8];
   1635 
   1636   // First transform rows
   1637   for (i = 0; i < 8; ++i) {
   1638     vpx_highbd_idct8_c(input, outptr, bd);
   1639     input += 8;
   1640     outptr += 8;
   1641   }
   1642 
   1643   // Then transform columns
   1644   for (i = 0; i < 8; ++i) {
   1645     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
   1646     vpx_highbd_idct8_c(temp_in, temp_out, bd);
   1647     for (j = 0; j < 8; ++j) {
   1648       dest[j * stride + i] = highbd_clip_pixel_add(
   1649           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
   1650     }
   1651   }
   1652 }
   1653 
   1654 void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
   1655                                  int stride, int bd) {
   1656   int i, j;
   1657   tran_low_t out[8 * 8] = { 0 };
   1658   tran_low_t *outptr = out;
   1659   tran_low_t temp_in[8], temp_out[8];
   1660 
   1661   // First transform rows
   1662   // Only first 4 row has non-zero coefs
   1663   for (i = 0; i < 4; ++i) {
   1664     vpx_highbd_idct8_c(input, outptr, bd);
   1665     input += 8;
   1666     outptr += 8;
   1667   }
   1668 
   1669   // Then transform columns
   1670   for (i = 0; i < 8; ++i) {
   1671     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
   1672     vpx_highbd_idct8_c(temp_in, temp_out, bd);
   1673     for (j = 0; j < 8; ++j) {
   1674       dest[j * stride + i] = highbd_clip_pixel_add(
   1675           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
   1676     }
   1677   }
   1678 }
   1679 
   1680 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
   1681                                 int stride, int bd) {
   1682   int i, j;
   1683   tran_high_t a1;
   1684   tran_low_t out =
   1685       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
   1686 
   1687   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   1688   a1 = ROUND_POWER_OF_TWO(out, 5);
   1689   for (j = 0; j < 8; ++j) {
   1690     for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
   1691     dest += stride;
   1692   }
   1693 }
   1694 
// 16-point inverse ADST for high bit-depth coefficients. |input| holds 16
// coefficients which are first re-ordered (interleaving from both ends, see
// the x0..x15 loads below), then passed through four butterfly/rotation
// stages, and finally written to |output| with a sign-and-position
// permutation. |bd| is used by HIGHBD_WRAPLOW for range clamping.
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  // Input permutation: even x's take coefficients from the top end
  // (15, 13, 11, ...), odd x's from the bottom (0, 2, 4, ...).
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  // bd is referenced only via HIGHBD_WRAPLOW, which may ignore it in some
  // configurations; silence the unused-parameter warning.
  (void)bd;

  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // Fast path: all-zero input yields all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: rotate the eight input pairs by the odd cospi angles, then
  // butterfly the top half against the bottom half.
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);

  // stage 2: pass x0..x7 through; rotate x8..x15 by cospi_4/28 and
  // cospi_20/12, then butterfly.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);

  // stage 3: rotate the middle quads by cospi_8/24 and butterfly.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);

  // stage 4: final cospi_16 rotations (scale by 1/sqrt(2)) on selected pairs.
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);

  // Output permutation with alternating negations.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x8, bd);
  output[2] = HIGHBD_WRAPLOW(x12, bd);
  output[3] = HIGHBD_WRAPLOW(-x4, bd);
  output[4] = HIGHBD_WRAPLOW(x6, bd);
  output[5] = HIGHBD_WRAPLOW(x14, bd);
  output[6] = HIGHBD_WRAPLOW(x10, bd);
  output[7] = HIGHBD_WRAPLOW(x2, bd);
  output[8] = HIGHBD_WRAPLOW(x3, bd);
  output[9] = HIGHBD_WRAPLOW(x11, bd);
  output[10] = HIGHBD_WRAPLOW(x15, bd);
  output[11] = HIGHBD_WRAPLOW(x7, bd);
  output[12] = HIGHBD_WRAPLOW(x5, bd);
  output[13] = HIGHBD_WRAPLOW(-x13, bd);
  output[14] = HIGHBD_WRAPLOW(x9, bd);
  output[15] = HIGHBD_WRAPLOW(-x1, bd);
}
   1871 
// 16-point inverse DCT for high bit-depth coefficients. Reads 16
// coefficients from |input|, runs a seven-stage butterfly network, and
// writes 16 values to |output|. |bd| is used by HIGHBD_WRAPLOW for range
// clamping.
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  // bd is referenced only via HIGHBD_WRAPLOW, which may ignore it in some
  // configurations; silence the unused-parameter warning.
  (void)bd;

  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // stage 1: bit-reversal-style input reordering. The "/ 2" indices are
  // compile-time constants kept to mirror the 32-point transform's layout.
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2: pass through the even half; rotate the odd half (8..15) by the
  // odd cospi angles.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 3: rotate 4..7 by cospi_4/28 and cospi_20/12; butterfly 8..15.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);

  // stage 4: DC/Nyquist rotation (cospi_16) on 0/1, cospi_8/24 on 2/3 and
  // 9/14, 10/13; butterfly 4..7.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: butterflies on 0..3 and 8..15; cospi_16 rotation on 5/6.
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);

  // stage 6: combine the 8-point halves; cospi_16 rotations on 10/13, 11/12.
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly producing the 16 outputs.
  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}
   2045 
   2046 void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
   2047                                     int stride, int bd) {
   2048   int i, j;
   2049   tran_low_t out[16 * 16];
   2050   tran_low_t *outptr = out;
   2051   tran_low_t temp_in[16], temp_out[16];
   2052 
   2053   // First transform rows
   2054   for (i = 0; i < 16; ++i) {
   2055     vpx_highbd_idct16_c(input, outptr, bd);
   2056     input += 16;
   2057     outptr += 16;
   2058   }
   2059 
   2060   // Then transform columns
   2061   for (i = 0; i < 16; ++i) {
   2062     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
   2063     vpx_highbd_idct16_c(temp_in, temp_out, bd);
   2064     for (j = 0; j < 16; ++j) {
   2065       dest[j * stride + i] = highbd_clip_pixel_add(
   2066           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2067     }
   2068   }
   2069 }
   2070 
   2071 void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
   2072                                    int stride, int bd) {
   2073   int i, j;
   2074   tran_low_t out[16 * 16] = { 0 };
   2075   tran_low_t *outptr = out;
   2076   tran_low_t temp_in[16], temp_out[16];
   2077 
   2078   // First transform rows. Since all non-zero dct coefficients are in
   2079   // upper-left 8x8 area, we only need to calculate first 8 rows here.
   2080   for (i = 0; i < 8; ++i) {
   2081     vpx_highbd_idct16_c(input, outptr, bd);
   2082     input += 16;
   2083     outptr += 16;
   2084   }
   2085 
   2086   // Then transform columns
   2087   for (i = 0; i < 16; ++i) {
   2088     uint16_t *destT = dest;
   2089     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
   2090     vpx_highbd_idct16_c(temp_in, temp_out, bd);
   2091     for (j = 0; j < 16; ++j) {
   2092       destT[i] = highbd_clip_pixel_add(destT[i],
   2093                                        ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2094       destT += stride;
   2095     }
   2096   }
   2097 }
   2098 
   2099 void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
   2100                                    int stride, int bd) {
   2101   int i, j;
   2102   tran_low_t out[16 * 16] = { 0 };
   2103   tran_low_t *outptr = out;
   2104   tran_low_t temp_in[16], temp_out[16];
   2105 
   2106   // First transform rows. Since all non-zero dct coefficients are in
   2107   // upper-left 4x4 area, we only need to calculate first 4 rows here.
   2108   for (i = 0; i < 4; ++i) {
   2109     vpx_highbd_idct16_c(input, outptr, bd);
   2110     input += 16;
   2111     outptr += 16;
   2112   }
   2113 
   2114   // Then transform columns
   2115   for (i = 0; i < 16; ++i) {
   2116     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
   2117     vpx_highbd_idct16_c(temp_in, temp_out, bd);
   2118     for (j = 0; j < 16; ++j) {
   2119       dest[j * stride + i] = highbd_clip_pixel_add(
   2120           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2121     }
   2122   }
   2123 }
   2124 
   2125 void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
   2126                                   int stride, int bd) {
   2127   int i, j;
   2128   tran_high_t a1;
   2129   tran_low_t out =
   2130       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
   2131 
   2132   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   2133   a1 = ROUND_POWER_OF_TWO(out, 6);
   2134   for (j = 0; j < 16; ++j) {
   2135     for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
   2136     dest += stride;
   2137   }
   2138 }
   2139 
   2140 static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   2141                             int bd) {
   2142   tran_low_t step1[32], step2[32];
   2143   tran_high_t temp1, temp2;
   2144   (void)bd;
   2145 
   2146   if (detect_invalid_highbd_input(input, 32)) {
   2147 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   2148     assert(0 && "invalid highbd txfm input");
   2149 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   2150     memset(output, 0, sizeof(*output) * 32);
   2151     return;
   2152   }
   2153 
   2154   // stage 1
   2155   step1[0] = input[0];
   2156   step1[1] = input[16];
   2157   step1[2] = input[8];
   2158   step1[3] = input[24];
   2159   step1[4] = input[4];
   2160   step1[5] = input[20];
   2161   step1[6] = input[12];
   2162   step1[7] = input[28];
   2163   step1[8] = input[2];
   2164   step1[9] = input[18];
   2165   step1[10] = input[10];
   2166   step1[11] = input[26];
   2167   step1[12] = input[6];
   2168   step1[13] = input[22];
   2169   step1[14] = input[14];
   2170   step1[15] = input[30];
   2171 
   2172   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   2173   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
   2174   step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2175   step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2176 
   2177   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   2178   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
   2179   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2180   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2181 
   2182   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   2183   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
   2184   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2185   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2186 
   2187   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   2188   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
   2189   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2190   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2191 
   2192   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   2193   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
   2194   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2195   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2196 
   2197   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   2198   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
   2199   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2200   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2201 
   2202   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   2203   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
   2204   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2205   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2206 
   2207   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   2208   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
   2209   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2210   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2211 
   2212   // stage 2
   2213   step2[0] = step1[0];
   2214   step2[1] = step1[1];
   2215   step2[2] = step1[2];
   2216   step2[3] = step1[3];
   2217   step2[4] = step1[4];
   2218   step2[5] = step1[5];
   2219   step2[6] = step1[6];
   2220   step2[7] = step1[7];
   2221 
   2222   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   2223   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
   2224   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2225   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2226 
   2227   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   2228   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
   2229   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2230   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2231 
   2232   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   2233   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
   2234   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2235   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2236 
   2237   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   2238   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
   2239   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2240   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2241 
   2242   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
   2243   step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
   2244   step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
   2245   step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
   2246   step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
   2247   step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
   2248   step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
   2249   step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
   2250   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
   2251   step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
   2252   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
   2253   step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
   2254   step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
   2255   step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
   2256   step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
   2257   step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
   2258 
   2259   // stage 3
   2260   step1[0] = step2[0];
   2261   step1[1] = step2[1];
   2262   step1[2] = step2[2];
   2263   step1[3] = step2[3];
   2264 
   2265   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   2266   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
   2267   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2268   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2269   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   2270   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
   2271   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2272   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2273 
   2274   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
   2275   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
   2276   step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
   2277   step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
   2278   step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
   2279   step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
   2280   step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
   2281   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
   2282 
   2283   step1[16] = step2[16];
   2284   step1[31] = step2[31];
   2285   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   2286   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
   2287   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2288   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2289   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   2290   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
   2291   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2292   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2293   step1[19] = step2[19];
   2294   step1[20] = step2[20];
   2295   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   2296   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
   2297   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2298   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2299   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   2300   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
   2301   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2302   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2303   step1[23] = step2[23];
   2304   step1[24] = step2[24];
   2305   step1[27] = step2[27];
   2306   step1[28] = step2[28];
   2307 
   2308   // stage 4
   2309   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   2310   temp2 = (step1[0] - step1[1]) * cospi_16_64;
   2311   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2312   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2313   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   2314   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
   2315   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2316   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2317   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
   2318   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
   2319   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
   2320   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
   2321 
   2322   step2[8] = step1[8];
   2323   step2[15] = step1[15];
   2324   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   2325   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
   2326   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2327   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2328   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   2329   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
   2330   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2331   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2332   step2[11] = step1[11];
   2333   step2[12] = step1[12];
   2334 
   2335   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
   2336   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
   2337   step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
   2338   step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
   2339   step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
   2340   step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
   2341   step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
   2342   step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
   2343 
   2344   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
   2345   step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
   2346   step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
   2347   step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
   2348   step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
   2349   step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
   2350   step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
   2351   step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
   2352 
   2353   // stage 5
   2354   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
   2355   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
   2356   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
   2357   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   2358   step1[4] = step2[4];
   2359   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   2360   temp2 = (step2[5] + step2[6]) * cospi_16_64;
   2361   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2362   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2363   step1[7] = step2[7];
   2364 
   2365   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
   2366   step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
   2367   step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
   2368   step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
   2369   step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
   2370   step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
   2371   step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
   2372   step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
   2373 
   2374   step1[16] = step2[16];
   2375   step1[17] = step2[17];
   2376   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   2377   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
   2378   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2379   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2380   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   2381   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
   2382   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2383   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2384   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   2385   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
   2386   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2387   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2388   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   2389   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
   2390   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2391   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2392   step1[22] = step2[22];
   2393   step1[23] = step2[23];
   2394   step1[24] = step2[24];
   2395   step1[25] = step2[25];
   2396   step1[30] = step2[30];
   2397   step1[31] = step2[31];
   2398 
   2399   // stage 6
   2400   step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
   2401   step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
   2402   step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
   2403   step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
   2404   step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
   2405   step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
   2406   step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
   2407   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   2408   step2[8] = step1[8];
   2409   step2[9] = step1[9];
   2410   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   2411   temp2 = (step1[10] + step1[13]) * cospi_16_64;
   2412   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2413   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2414   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   2415   temp2 = (step1[11] + step1[12]) * cospi_16_64;
   2416   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2417   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2418   step2[14] = step1[14];
   2419   step2[15] = step1[15];
   2420 
   2421   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
   2422   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
   2423   step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
   2424   step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
   2425   step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
   2426   step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
   2427   step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
   2428   step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
   2429 
   2430   step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
   2431   step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
   2432   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
   2433   step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
   2434   step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
   2435   step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
   2436   step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
   2437   step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
   2438 
   2439   // stage 7
   2440   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
   2441   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
   2442   step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
   2443   step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
   2444   step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
   2445   step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
   2446   step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
   2447   step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
   2448   step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
   2449   step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
   2450   step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
   2451   step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
   2452   step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
   2453   step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
   2454   step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
   2455   step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
   2456 
   2457   step1[16] = step2[16];
   2458   step1[17] = step2[17];
   2459   step1[18] = step2[18];
   2460   step1[19] = step2[19];
   2461   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   2462   temp2 = (step2[20] + step2[27]) * cospi_16_64;
   2463   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2464   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2465   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   2466   temp2 = (step2[21] + step2[26]) * cospi_16_64;
   2467   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2468   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2469   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   2470   temp2 = (step2[22] + step2[25]) * cospi_16_64;
   2471   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2472   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2473   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   2474   temp2 = (step2[23] + step2[24]) * cospi_16_64;
   2475   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2476   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2477   step1[28] = step2[28];
   2478   step1[29] = step2[29];
   2479   step1[30] = step2[30];
   2480   step1[31] = step2[31];
   2481 
   2482   // final stage
   2483   output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
   2484   output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
   2485   output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
   2486   output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
   2487   output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
   2488   output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
   2489   output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
   2490   output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
   2491   output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
   2492   output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
   2493   output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
   2494   output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
   2495   output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
   2496   output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
   2497   output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
   2498   output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
   2499   output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
   2500   output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
   2501   output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
   2502   output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
   2503   output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
   2504   output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
   2505   output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
   2506   output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
   2507   output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
   2508   output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
   2509   output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
   2510   output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
   2511   output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
   2512   output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
   2513   output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
   2514   output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
   2515 }
   2516 
   2517 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
   2518                                      int stride, int bd) {
   2519   int i, j;
   2520   tran_low_t out[32 * 32];
   2521   tran_low_t *outptr = out;
   2522   tran_low_t temp_in[32], temp_out[32];
   2523 
   2524   // Rows
   2525   for (i = 0; i < 32; ++i) {
   2526     tran_low_t zero_coeff = 0;
   2527     for (j = 0; j < 32; ++j) zero_coeff |= input[j];
   2528 
   2529     if (zero_coeff)
   2530       highbd_idct32_c(input, outptr, bd);
   2531     else
   2532       memset(outptr, 0, sizeof(tran_low_t) * 32);
   2533     input += 32;
   2534     outptr += 32;
   2535   }
   2536 
   2537   // Columns
   2538   for (i = 0; i < 32; ++i) {
   2539     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
   2540     highbd_idct32_c(temp_in, temp_out, bd);
   2541     for (j = 0; j < 32; ++j) {
   2542       dest[j * stride + i] = highbd_clip_pixel_add(
   2543           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2544     }
   2545   }
   2546 }
   2547 
   2548 void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
   2549                                     int stride, int bd) {
   2550   int i, j;
   2551   tran_low_t out[32 * 32] = { 0 };
   2552   tran_low_t *outptr = out;
   2553   tran_low_t temp_in[32], temp_out[32];
   2554 
   2555   // Rows
   2556   // Only upper-left 16x16 has non-zero coeff
   2557   for (i = 0; i < 16; ++i) {
   2558     highbd_idct32_c(input, outptr, bd);
   2559     input += 32;
   2560     outptr += 32;
   2561   }
   2562 
   2563   // Columns
   2564   for (i = 0; i < 32; ++i) {
   2565     uint16_t *destT = dest;
   2566     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
   2567     highbd_idct32_c(temp_in, temp_out, bd);
   2568     for (j = 0; j < 32; ++j) {
   2569       destT[i] = highbd_clip_pixel_add(destT[i],
   2570                                        ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2571       destT += stride;
   2572     }
   2573   }
   2574 }
   2575 
   2576 void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
   2577                                    int stride, int bd) {
   2578   int i, j;
   2579   tran_low_t out[32 * 32] = { 0 };
   2580   tran_low_t *outptr = out;
   2581   tran_low_t temp_in[32], temp_out[32];
   2582 
   2583   // Rows
   2584   // Only upper-left 8x8 has non-zero coeff
   2585   for (i = 0; i < 8; ++i) {
   2586     highbd_idct32_c(input, outptr, bd);
   2587     input += 32;
   2588     outptr += 32;
   2589   }
   2590 
   2591   // Columns
   2592   for (i = 0; i < 32; ++i) {
   2593     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
   2594     highbd_idct32_c(temp_in, temp_out, bd);
   2595     for (j = 0; j < 32; ++j) {
   2596       dest[j * stride + i] = highbd_clip_pixel_add(
   2597           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2598     }
   2599   }
   2600 }
   2601 
   2602 void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
   2603                                   int stride, int bd) {
   2604   int i, j;
   2605   int a1;
   2606   tran_low_t out =
   2607       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
   2608 
   2609   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   2610   a1 = ROUND_POWER_OF_TWO(out, 6);
   2611 
   2612   for (j = 0; j < 32; ++j) {
   2613     for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
   2614     dest += stride;
   2615   }
   2616 }
   2617 
   2618 #endif  // CONFIG_VP9_HIGHBITDEPTH
   2619