Home | History | Annotate | Download | only in vpx_dsp
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <math.h>
     12 #include <stdlib.h>
     13 #include <string.h>
     14 
     15 #include "./vpx_dsp_rtcd.h"
     16 #include "vpx_dsp/inv_txfm.h"
     17 
     18 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
     19   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     20      0.5 shifts per pixel. */
     21   int i;
     22   tran_low_t output[16];
     23   tran_high_t a1, b1, c1, d1, e1;
     24   const tran_low_t *ip = input;
     25   tran_low_t *op = output;
     26 
     27   for (i = 0; i < 4; i++) {
     28     a1 = ip[0] >> UNIT_QUANT_SHIFT;
     29     c1 = ip[1] >> UNIT_QUANT_SHIFT;
     30     d1 = ip[2] >> UNIT_QUANT_SHIFT;
     31     b1 = ip[3] >> UNIT_QUANT_SHIFT;
     32     a1 += c1;
     33     d1 -= b1;
     34     e1 = (a1 - d1) >> 1;
     35     b1 = e1 - b1;
     36     c1 = e1 - c1;
     37     a1 -= b1;
     38     d1 += c1;
     39     op[0] = WRAPLOW(a1);
     40     op[1] = WRAPLOW(b1);
     41     op[2] = WRAPLOW(c1);
     42     op[3] = WRAPLOW(d1);
     43     ip += 4;
     44     op += 4;
     45   }
     46 
     47   ip = output;
     48   for (i = 0; i < 4; i++) {
     49     a1 = ip[4 * 0];
     50     c1 = ip[4 * 1];
     51     d1 = ip[4 * 2];
     52     b1 = ip[4 * 3];
     53     a1 += c1;
     54     d1 -= b1;
     55     e1 = (a1 - d1) >> 1;
     56     b1 = e1 - b1;
     57     c1 = e1 - c1;
     58     a1 -= b1;
     59     d1 += c1;
     60     dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
     61     dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
     62     dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
     63     dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
     64 
     65     ip++;
     66     dest++;
     67   }
     68 }
     69 
     70 void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
     71   int i;
     72   tran_high_t a1, e1;
     73   tran_low_t tmp[4];
     74   const tran_low_t *ip = in;
     75   tran_low_t *op = tmp;
     76 
     77   a1 = ip[0] >> UNIT_QUANT_SHIFT;
     78   e1 = a1 >> 1;
     79   a1 -= e1;
     80   op[0] = WRAPLOW(a1);
     81   op[1] = op[2] = op[3] = WRAPLOW(e1);
     82 
     83   ip = tmp;
     84   for (i = 0; i < 4; i++) {
     85     e1 = ip[0] >> 1;
     86     a1 = ip[0] - e1;
     87     dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
     88     dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
     89     dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
     90     dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
     91     ip++;
     92     dest++;
     93   }
     94 }
     95 
     96 void iadst4_c(const tran_low_t *input, tran_low_t *output) {
     97   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
     98   tran_low_t x0 = input[0];
     99   tran_low_t x1 = input[1];
    100   tran_low_t x2 = input[2];
    101   tran_low_t x3 = input[3];
    102 
    103   if (!(x0 | x1 | x2 | x3)) {
    104     memset(output, 0, 4 * sizeof(*output));
    105     return;
    106   }
    107 
    108   // 32-bit result is enough for the following multiplications.
    109   s0 = sinpi_1_9 * x0;
    110   s1 = sinpi_2_9 * x0;
    111   s2 = sinpi_3_9 * x1;
    112   s3 = sinpi_4_9 * x2;
    113   s4 = sinpi_1_9 * x2;
    114   s5 = sinpi_2_9 * x3;
    115   s6 = sinpi_4_9 * x3;
    116   s7 = WRAPLOW(x0 - x2 + x3);
    117 
    118   s0 = s0 + s3 + s5;
    119   s1 = s1 - s4 - s6;
    120   s3 = s2;
    121   s2 = sinpi_3_9 * s7;
    122 
    123   // 1-D transform scaling factor is sqrt(2).
    124   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
    125   // + 1b (addition) = 29b.
    126   // Hence the output bit depth is 15b.
    127   output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
    128   output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
    129   output[2] = WRAPLOW(dct_const_round_shift(s2));
    130   output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
    131 }
    132 
    133 void idct4_c(const tran_low_t *input, tran_low_t *output) {
    134   int16_t step[4];
    135   tran_high_t temp1, temp2;
    136 
    137   // stage 1
    138   temp1 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64;
    139   temp2 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64;
    140   step[0] = WRAPLOW(dct_const_round_shift(temp1));
    141   step[1] = WRAPLOW(dct_const_round_shift(temp2));
    142   temp1 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64;
    143   temp2 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64;
    144   step[2] = WRAPLOW(dct_const_round_shift(temp1));
    145   step[3] = WRAPLOW(dct_const_round_shift(temp2));
    146 
    147   // stage 2
    148   output[0] = WRAPLOW(step[0] + step[3]);
    149   output[1] = WRAPLOW(step[1] + step[2]);
    150   output[2] = WRAPLOW(step[1] - step[2]);
    151   output[3] = WRAPLOW(step[0] - step[3]);
    152 }
    153 
    154 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    155   int i, j;
    156   tran_low_t out[4 * 4];
    157   tran_low_t *outptr = out;
    158   tran_low_t temp_in[4], temp_out[4];
    159 
    160   // Rows
    161   for (i = 0; i < 4; ++i) {
    162     idct4_c(input, outptr);
    163     input += 4;
    164     outptr += 4;
    165   }
    166 
    167   // Columns
    168   for (i = 0; i < 4; ++i) {
    169     for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
    170     idct4_c(temp_in, temp_out);
    171     for (j = 0; j < 4; ++j) {
    172       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    173                                             ROUND_POWER_OF_TWO(temp_out[j], 4));
    174     }
    175   }
    176 }
    177 
    178 void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    179   int i;
    180   tran_high_t a1;
    181   tran_low_t out =
    182       WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
    183 
    184   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
    185   a1 = ROUND_POWER_OF_TWO(out, 4);
    186 
    187   for (i = 0; i < 4; i++) {
    188     dest[0] = clip_pixel_add(dest[0], a1);
    189     dest[1] = clip_pixel_add(dest[1], a1);
    190     dest[2] = clip_pixel_add(dest[2], a1);
    191     dest[3] = clip_pixel_add(dest[3], a1);
    192     dest += stride;
    193   }
    194 }
    195 
    196 void iadst8_c(const tran_low_t *input, tran_low_t *output) {
    197   int s0, s1, s2, s3, s4, s5, s6, s7;
    198   tran_high_t x0 = input[7];
    199   tran_high_t x1 = input[0];
    200   tran_high_t x2 = input[5];
    201   tran_high_t x3 = input[2];
    202   tran_high_t x4 = input[3];
    203   tran_high_t x5 = input[4];
    204   tran_high_t x6 = input[1];
    205   tran_high_t x7 = input[6];
    206 
    207   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    208     memset(output, 0, 8 * sizeof(*output));
    209     return;
    210   }
    211 
    212   // stage 1
    213   s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
    214   s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
    215   s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
    216   s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
    217   s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
    218   s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
    219   s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
    220   s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
    221 
    222   x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
    223   x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
    224   x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
    225   x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
    226   x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
    227   x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
    228   x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
    229   x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
    230 
    231   // stage 2
    232   s0 = (int)x0;
    233   s1 = (int)x1;
    234   s2 = (int)x2;
    235   s3 = (int)x3;
    236   s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
    237   s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
    238   s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
    239   s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
    240 
    241   x0 = WRAPLOW(s0 + s2);
    242   x1 = WRAPLOW(s1 + s3);
    243   x2 = WRAPLOW(s0 - s2);
    244   x3 = WRAPLOW(s1 - s3);
    245   x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
    246   x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
    247   x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
    248   x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
    249 
    250   // stage 3
    251   s2 = (int)(cospi_16_64 * (x2 + x3));
    252   s3 = (int)(cospi_16_64 * (x2 - x3));
    253   s6 = (int)(cospi_16_64 * (x6 + x7));
    254   s7 = (int)(cospi_16_64 * (x6 - x7));
    255 
    256   x2 = WRAPLOW(dct_const_round_shift(s2));
    257   x3 = WRAPLOW(dct_const_round_shift(s3));
    258   x6 = WRAPLOW(dct_const_round_shift(s6));
    259   x7 = WRAPLOW(dct_const_round_shift(s7));
    260 
    261   output[0] = WRAPLOW(x0);
    262   output[1] = WRAPLOW(-x4);
    263   output[2] = WRAPLOW(x6);
    264   output[3] = WRAPLOW(-x2);
    265   output[4] = WRAPLOW(x3);
    266   output[5] = WRAPLOW(-x7);
    267   output[6] = WRAPLOW(x5);
    268   output[7] = WRAPLOW(-x1);
    269 }
    270 
    271 void idct8_c(const tran_low_t *input, tran_low_t *output) {
    272   int16_t step1[8], step2[8];
    273   tran_high_t temp1, temp2;
    274 
    275   // stage 1
    276   step1[0] = (int16_t)input[0];
    277   step1[2] = (int16_t)input[4];
    278   step1[1] = (int16_t)input[2];
    279   step1[3] = (int16_t)input[6];
    280   temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64;
    281   temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64;
    282   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
    283   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
    284   temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64;
    285   temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64;
    286   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
    287   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
    288 
    289   // stage 2
    290   temp1 = (step1[0] + step1[2]) * cospi_16_64;
    291   temp2 = (step1[0] - step1[2]) * cospi_16_64;
    292   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
    293   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
    294   temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
    295   temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
    296   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
    297   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
    298   step2[4] = WRAPLOW(step1[4] + step1[5]);
    299   step2[5] = WRAPLOW(step1[4] - step1[5]);
    300   step2[6] = WRAPLOW(-step1[6] + step1[7]);
    301   step2[7] = WRAPLOW(step1[6] + step1[7]);
    302 
    303   // stage 3
    304   step1[0] = WRAPLOW(step2[0] + step2[3]);
    305   step1[1] = WRAPLOW(step2[1] + step2[2]);
    306   step1[2] = WRAPLOW(step2[1] - step2[2]);
    307   step1[3] = WRAPLOW(step2[0] - step2[3]);
    308   step1[4] = step2[4];
    309   temp1 = (step2[6] - step2[5]) * cospi_16_64;
    310   temp2 = (step2[5] + step2[6]) * cospi_16_64;
    311   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
    312   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
    313   step1[7] = step2[7];
    314 
    315   // stage 4
    316   output[0] = WRAPLOW(step1[0] + step1[7]);
    317   output[1] = WRAPLOW(step1[1] + step1[6]);
    318   output[2] = WRAPLOW(step1[2] + step1[5]);
    319   output[3] = WRAPLOW(step1[3] + step1[4]);
    320   output[4] = WRAPLOW(step1[3] - step1[4]);
    321   output[5] = WRAPLOW(step1[2] - step1[5]);
    322   output[6] = WRAPLOW(step1[1] - step1[6]);
    323   output[7] = WRAPLOW(step1[0] - step1[7]);
    324 }
    325 
    326 void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    327   int i, j;
    328   tran_low_t out[8 * 8];
    329   tran_low_t *outptr = out;
    330   tran_low_t temp_in[8], temp_out[8];
    331 
    332   // First transform rows
    333   for (i = 0; i < 8; ++i) {
    334     idct8_c(input, outptr);
    335     input += 8;
    336     outptr += 8;
    337   }
    338 
    339   // Then transform columns
    340   for (i = 0; i < 8; ++i) {
    341     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    342     idct8_c(temp_in, temp_out);
    343     for (j = 0; j < 8; ++j) {
    344       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    345                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
    346     }
    347   }
    348 }
    349 
    350 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    351   int i, j;
    352   tran_low_t out[8 * 8] = { 0 };
    353   tran_low_t *outptr = out;
    354   tran_low_t temp_in[8], temp_out[8];
    355 
    356   // First transform rows
    357   // Only first 4 row has non-zero coefs
    358   for (i = 0; i < 4; ++i) {
    359     idct8_c(input, outptr);
    360     input += 8;
    361     outptr += 8;
    362   }
    363 
    364   // Then transform columns
    365   for (i = 0; i < 8; ++i) {
    366     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    367     idct8_c(temp_in, temp_out);
    368     for (j = 0; j < 8; ++j) {
    369       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    370                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
    371     }
    372   }
    373 }
    374 
    375 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    376   int i, j;
    377   tran_high_t a1;
    378   tran_low_t out =
    379       WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
    380 
    381   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
    382   a1 = ROUND_POWER_OF_TWO(out, 5);
    383   for (j = 0; j < 8; ++j) {
    384     for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    385     dest += stride;
    386   }
    387 }
    388 
    389 void iadst16_c(const tran_low_t *input, tran_low_t *output) {
    390   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
    391   tran_high_t s9, s10, s11, s12, s13, s14, s15;
    392   tran_high_t x0 = input[15];
    393   tran_high_t x1 = input[0];
    394   tran_high_t x2 = input[13];
    395   tran_high_t x3 = input[2];
    396   tran_high_t x4 = input[11];
    397   tran_high_t x5 = input[4];
    398   tran_high_t x6 = input[9];
    399   tran_high_t x7 = input[6];
    400   tran_high_t x8 = input[7];
    401   tran_high_t x9 = input[8];
    402   tran_high_t x10 = input[5];
    403   tran_high_t x11 = input[10];
    404   tran_high_t x12 = input[3];
    405   tran_high_t x13 = input[12];
    406   tran_high_t x14 = input[1];
    407   tran_high_t x15 = input[14];
    408 
    409   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
    410         x13 | x14 | x15)) {
    411     memset(output, 0, 16 * sizeof(*output));
    412     return;
    413   }
    414 
    415   // stage 1
    416   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
    417   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
    418   s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
    419   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
    420   s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
    421   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
    422   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
    423   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
    424   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
    425   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
    426   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
    427   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
    428   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
    429   s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
    430   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
    431   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
    432 
    433   x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
    434   x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
    435   x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
    436   x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
    437   x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
    438   x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
    439   x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
    440   x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
    441   x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
    442   x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
    443   x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
    444   x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
    445   x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
    446   x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
    447   x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
    448   x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
    449 
    450   // stage 2
    451   s0 = x0;
    452   s1 = x1;
    453   s2 = x2;
    454   s3 = x3;
    455   s4 = x4;
    456   s5 = x5;
    457   s6 = x6;
    458   s7 = x7;
    459   s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
    460   s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
    461   s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
    462   s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
    463   s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
    464   s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
    465   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
    466   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
    467 
    468   x0 = WRAPLOW(s0 + s4);
    469   x1 = WRAPLOW(s1 + s5);
    470   x2 = WRAPLOW(s2 + s6);
    471   x3 = WRAPLOW(s3 + s7);
    472   x4 = WRAPLOW(s0 - s4);
    473   x5 = WRAPLOW(s1 - s5);
    474   x6 = WRAPLOW(s2 - s6);
    475   x7 = WRAPLOW(s3 - s7);
    476   x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
    477   x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
    478   x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
    479   x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
    480   x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
    481   x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
    482   x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
    483   x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
    484 
    485   // stage 3
    486   s0 = x0;
    487   s1 = x1;
    488   s2 = x2;
    489   s3 = x3;
    490   s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
    491   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
    492   s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
    493   s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
    494   s8 = x8;
    495   s9 = x9;
    496   s10 = x10;
    497   s11 = x11;
    498   s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
    499   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
    500   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
    501   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
    502 
    503   x0 = WRAPLOW(s0 + s2);
    504   x1 = WRAPLOW(s1 + s3);
    505   x2 = WRAPLOW(s0 - s2);
    506   x3 = WRAPLOW(s1 - s3);
    507   x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
    508   x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
    509   x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
    510   x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
    511   x8 = WRAPLOW(s8 + s10);
    512   x9 = WRAPLOW(s9 + s11);
    513   x10 = WRAPLOW(s8 - s10);
    514   x11 = WRAPLOW(s9 - s11);
    515   x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
    516   x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
    517   x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
    518   x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
    519 
    520   // stage 4
    521   s2 = (-cospi_16_64) * (x2 + x3);
    522   s3 = cospi_16_64 * (x2 - x3);
    523   s6 = cospi_16_64 * (x6 + x7);
    524   s7 = cospi_16_64 * (-x6 + x7);
    525   s10 = cospi_16_64 * (x10 + x11);
    526   s11 = cospi_16_64 * (-x10 + x11);
    527   s14 = (-cospi_16_64) * (x14 + x15);
    528   s15 = cospi_16_64 * (x14 - x15);
    529 
    530   x2 = WRAPLOW(dct_const_round_shift(s2));
    531   x3 = WRAPLOW(dct_const_round_shift(s3));
    532   x6 = WRAPLOW(dct_const_round_shift(s6));
    533   x7 = WRAPLOW(dct_const_round_shift(s7));
    534   x10 = WRAPLOW(dct_const_round_shift(s10));
    535   x11 = WRAPLOW(dct_const_round_shift(s11));
    536   x14 = WRAPLOW(dct_const_round_shift(s14));
    537   x15 = WRAPLOW(dct_const_round_shift(s15));
    538 
    539   output[0] = WRAPLOW(x0);
    540   output[1] = WRAPLOW(-x8);
    541   output[2] = WRAPLOW(x12);
    542   output[3] = WRAPLOW(-x4);
    543   output[4] = WRAPLOW(x6);
    544   output[5] = WRAPLOW(x14);
    545   output[6] = WRAPLOW(x10);
    546   output[7] = WRAPLOW(x2);
    547   output[8] = WRAPLOW(x3);
    548   output[9] = WRAPLOW(x11);
    549   output[10] = WRAPLOW(x15);
    550   output[11] = WRAPLOW(x7);
    551   output[12] = WRAPLOW(x5);
    552   output[13] = WRAPLOW(-x13);
    553   output[14] = WRAPLOW(x9);
    554   output[15] = WRAPLOW(-x1);
    555 }
    556 
    557 void idct16_c(const tran_low_t *input, tran_low_t *output) {
    558   int16_t step1[16], step2[16];
    559   tran_high_t temp1, temp2;
    560 
    561   // stage 1
    562   step1[0] = (int16_t)input[0 / 2];
    563   step1[1] = (int16_t)input[16 / 2];
    564   step1[2] = (int16_t)input[8 / 2];
    565   step1[3] = (int16_t)input[24 / 2];
    566   step1[4] = (int16_t)input[4 / 2];
    567   step1[5] = (int16_t)input[20 / 2];
    568   step1[6] = (int16_t)input[12 / 2];
    569   step1[7] = (int16_t)input[28 / 2];
    570   step1[8] = (int16_t)input[2 / 2];
    571   step1[9] = (int16_t)input[18 / 2];
    572   step1[10] = (int16_t)input[10 / 2];
    573   step1[11] = (int16_t)input[26 / 2];
    574   step1[12] = (int16_t)input[6 / 2];
    575   step1[13] = (int16_t)input[22 / 2];
    576   step1[14] = (int16_t)input[14 / 2];
    577   step1[15] = (int16_t)input[30 / 2];
    578 
    579   // stage 2
    580   step2[0] = step1[0];
    581   step2[1] = step1[1];
    582   step2[2] = step1[2];
    583   step2[3] = step1[3];
    584   step2[4] = step1[4];
    585   step2[5] = step1[5];
    586   step2[6] = step1[6];
    587   step2[7] = step1[7];
    588 
    589   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
    590   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
    591   step2[8] = WRAPLOW(dct_const_round_shift(temp1));
    592   step2[15] = WRAPLOW(dct_const_round_shift(temp2));
    593 
    594   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
    595   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
    596   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
    597   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
    598 
    599   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
    600   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
    601   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
    602   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
    603 
    604   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
    605   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
    606   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
    607   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
    608 
    609   // stage 3
    610   step1[0] = step2[0];
    611   step1[1] = step2[1];
    612   step1[2] = step2[2];
    613   step1[3] = step2[3];
    614 
    615   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
    616   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
    617   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
    618   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
    619   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
    620   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
    621   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
    622   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
    623 
    624   step1[8] = WRAPLOW(step2[8] + step2[9]);
    625   step1[9] = WRAPLOW(step2[8] - step2[9]);
    626   step1[10] = WRAPLOW(-step2[10] + step2[11]);
    627   step1[11] = WRAPLOW(step2[10] + step2[11]);
    628   step1[12] = WRAPLOW(step2[12] + step2[13]);
    629   step1[13] = WRAPLOW(step2[12] - step2[13]);
    630   step1[14] = WRAPLOW(-step2[14] + step2[15]);
    631   step1[15] = WRAPLOW(step2[14] + step2[15]);
    632 
    633   // stage 4
    634   temp1 = (step1[0] + step1[1]) * cospi_16_64;
    635   temp2 = (step1[0] - step1[1]) * cospi_16_64;
    636   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
    637   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
    638   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
    639   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
    640   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
    641   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
    642   step2[4] = WRAPLOW(step1[4] + step1[5]);
    643   step2[5] = WRAPLOW(step1[4] - step1[5]);
    644   step2[6] = WRAPLOW(-step1[6] + step1[7]);
    645   step2[7] = WRAPLOW(step1[6] + step1[7]);
    646 
    647   step2[8] = step1[8];
    648   step2[15] = step1[15];
    649   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
    650   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
    651   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
    652   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
    653   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
    654   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
    655   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
    656   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
    657   step2[11] = step1[11];
    658   step2[12] = step1[12];
    659 
    660   // stage 5
    661   step1[0] = WRAPLOW(step2[0] + step2[3]);
    662   step1[1] = WRAPLOW(step2[1] + step2[2]);
    663   step1[2] = WRAPLOW(step2[1] - step2[2]);
    664   step1[3] = WRAPLOW(step2[0] - step2[3]);
    665   step1[4] = step2[4];
    666   temp1 = (step2[6] - step2[5]) * cospi_16_64;
    667   temp2 = (step2[5] + step2[6]) * cospi_16_64;
    668   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
    669   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
    670   step1[7] = step2[7];
    671 
    672   step1[8] = WRAPLOW(step2[8] + step2[11]);
    673   step1[9] = WRAPLOW(step2[9] + step2[10]);
    674   step1[10] = WRAPLOW(step2[9] - step2[10]);
    675   step1[11] = WRAPLOW(step2[8] - step2[11]);
    676   step1[12] = WRAPLOW(-step2[12] + step2[15]);
    677   step1[13] = WRAPLOW(-step2[13] + step2[14]);
    678   step1[14] = WRAPLOW(step2[13] + step2[14]);
    679   step1[15] = WRAPLOW(step2[12] + step2[15]);
    680 
    681   // stage 6
    682   step2[0] = WRAPLOW(step1[0] + step1[7]);
    683   step2[1] = WRAPLOW(step1[1] + step1[6]);
    684   step2[2] = WRAPLOW(step1[2] + step1[5]);
    685   step2[3] = WRAPLOW(step1[3] + step1[4]);
    686   step2[4] = WRAPLOW(step1[3] - step1[4]);
    687   step2[5] = WRAPLOW(step1[2] - step1[5]);
    688   step2[6] = WRAPLOW(step1[1] - step1[6]);
    689   step2[7] = WRAPLOW(step1[0] - step1[7]);
    690   step2[8] = step1[8];
    691   step2[9] = step1[9];
    692   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
    693   temp2 = (step1[10] + step1[13]) * cospi_16_64;
    694   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
    695   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
    696   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
    697   temp2 = (step1[11] + step1[12]) * cospi_16_64;
    698   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
    699   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
    700   step2[14] = step1[14];
    701   step2[15] = step1[15];
    702 
    703   // stage 7
    704   output[0] = WRAPLOW(step2[0] + step2[15]);
    705   output[1] = WRAPLOW(step2[1] + step2[14]);
    706   output[2] = WRAPLOW(step2[2] + step2[13]);
    707   output[3] = WRAPLOW(step2[3] + step2[12]);
    708   output[4] = WRAPLOW(step2[4] + step2[11]);
    709   output[5] = WRAPLOW(step2[5] + step2[10]);
    710   output[6] = WRAPLOW(step2[6] + step2[9]);
    711   output[7] = WRAPLOW(step2[7] + step2[8]);
    712   output[8] = WRAPLOW(step2[7] - step2[8]);
    713   output[9] = WRAPLOW(step2[6] - step2[9]);
    714   output[10] = WRAPLOW(step2[5] - step2[10]);
    715   output[11] = WRAPLOW(step2[4] - step2[11]);
    716   output[12] = WRAPLOW(step2[3] - step2[12]);
    717   output[13] = WRAPLOW(step2[2] - step2[13]);
    718   output[14] = WRAPLOW(step2[1] - step2[14]);
    719   output[15] = WRAPLOW(step2[0] - step2[15]);
    720 }
    721 
    722 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
    723                              int stride) {
    724   int i, j;
    725   tran_low_t out[16 * 16];
    726   tran_low_t *outptr = out;
    727   tran_low_t temp_in[16], temp_out[16];
    728 
    729   // First transform rows
    730   for (i = 0; i < 16; ++i) {
    731     idct16_c(input, outptr);
    732     input += 16;
    733     outptr += 16;
    734   }
    735 
    736   // Then transform columns
    737   for (i = 0; i < 16; ++i) {
    738     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    739     idct16_c(temp_in, temp_out);
    740     for (j = 0; j < 16; ++j) {
    741       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    742                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
    743     }
    744   }
    745 }
    746 
    747 void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
    748                             int stride) {
    749   int i, j;
    750   tran_low_t out[16 * 16] = { 0 };
    751   tran_low_t *outptr = out;
    752   tran_low_t temp_in[16], temp_out[16];
    753 
    754   // First transform rows. Since all non-zero dct coefficients are in
    755   // upper-left 8x8 area, we only need to calculate first 8 rows here.
    756   for (i = 0; i < 8; ++i) {
    757     idct16_c(input, outptr);
    758     input += 16;
    759     outptr += 16;
    760   }
    761 
    762   // Then transform columns
    763   for (i = 0; i < 16; ++i) {
    764     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    765     idct16_c(temp_in, temp_out);
    766     for (j = 0; j < 16; ++j) {
    767       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    768                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
    769     }
    770   }
    771 }
    772 
    773 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
    774                             int stride) {
    775   int i, j;
    776   tran_low_t out[16 * 16] = { 0 };
    777   tran_low_t *outptr = out;
    778   tran_low_t temp_in[16], temp_out[16];
    779 
    780   // First transform rows. Since all non-zero dct coefficients are in
    781   // upper-left 4x4 area, we only need to calculate first 4 rows here.
    782   for (i = 0; i < 4; ++i) {
    783     idct16_c(input, outptr);
    784     input += 16;
    785     outptr += 16;
    786   }
    787 
    788   // Then transform columns
    789   for (i = 0; i < 16; ++i) {
    790     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    791     idct16_c(temp_in, temp_out);
    792     for (j = 0; j < 16; ++j) {
    793       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    794                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
    795     }
    796   }
    797 }
    798 
    799 void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    800   int i, j;
    801   tran_high_t a1;
    802   tran_low_t out =
    803       WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
    804 
    805   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
    806   a1 = ROUND_POWER_OF_TWO(out, 6);
    807   for (j = 0; j < 16; ++j) {
    808     for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    809     dest += stride;
    810   }
    811 }
    812 
    813 void idct32_c(const tran_low_t *input, tran_low_t *output) {
          // 1-D 32-point inverse DCT written out as a fixed butterfly network.
          // Reads input[0..31] and writes output[0..31]. Every input is cast to
          // int16_t before use, and all intermediate values are held in the
          // int16_t arrays step1/step2, which alternate as source and destination
          // from one stage to the next.
    814   int16_t step1[32], step2[32];
          // temp1/temp2 hold the wide (tran_high_t) sum of products for one pair
          // before it goes through dct_const_round_shift() and WRAPLOW().
    815   tran_high_t temp1, temp2;
    816 
    817   // stage 1
          // Even-indexed inputs are copied into step1[0..15] in permuted order;
          // odd-indexed inputs are combined pairwise with cospi_* weights into
          // step1[16..31].
    818   step1[0] = (int16_t)input[0];
    819   step1[1] = (int16_t)input[16];
    820   step1[2] = (int16_t)input[8];
    821   step1[3] = (int16_t)input[24];
    822   step1[4] = (int16_t)input[4];
    823   step1[5] = (int16_t)input[20];
    824   step1[6] = (int16_t)input[12];
    825   step1[7] = (int16_t)input[28];
    826   step1[8] = (int16_t)input[2];
    827   step1[9] = (int16_t)input[18];
    828   step1[10] = (int16_t)input[10];
    829   step1[11] = (int16_t)input[26];
    830   step1[12] = (int16_t)input[6];
    831   step1[13] = (int16_t)input[22];
    832   step1[14] = (int16_t)input[14];
    833   step1[15] = (int16_t)input[30];
    834 
    835   temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64;
    836   temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64;
    837   step1[16] = WRAPLOW(dct_const_round_shift(temp1));
    838   step1[31] = WRAPLOW(dct_const_round_shift(temp2));
    839 
    840   temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64;
    841   temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64;
    842   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
    843   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
    844 
    845   temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64;
    846   temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64;
    847   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
    848   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
    849 
    850   temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64;
    851   temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64;
    852   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
    853   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
    854 
    855   temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64;
    856   temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64;
    857   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
    858   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
    859 
    860   temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64;
    861   temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64;
    862   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
    863   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
    864 
    865   temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64;
    866   temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64;
    867   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
    868   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
    869 
    870   temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64;
    871   temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64;
    872   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
    873   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
    874 
    875   // stage 2
          // 0..7 pass through; pairs (8,15), (9,14), (10,13), (11,12) are combined
          // with cospi_* weights; 16..31 get add/subtract butterflies on adjacent
          // pairs.
    876   step2[0] = step1[0];
    877   step2[1] = step1[1];
    878   step2[2] = step1[2];
    879   step2[3] = step1[3];
    880   step2[4] = step1[4];
    881   step2[5] = step1[5];
    882   step2[6] = step1[6];
    883   step2[7] = step1[7];
    884 
    885   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
    886   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
    887   step2[8] = WRAPLOW(dct_const_round_shift(temp1));
    888   step2[15] = WRAPLOW(dct_const_round_shift(temp2));
    889 
    890   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
    891   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
    892   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
    893   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
    894 
    895   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
    896   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
    897   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
    898   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
    899 
    900   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
    901   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
    902   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
    903   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
    904 
    905   step2[16] = WRAPLOW(step1[16] + step1[17]);
    906   step2[17] = WRAPLOW(step1[16] - step1[17]);
    907   step2[18] = WRAPLOW(-step1[18] + step1[19]);
    908   step2[19] = WRAPLOW(step1[18] + step1[19]);
    909   step2[20] = WRAPLOW(step1[20] + step1[21]);
    910   step2[21] = WRAPLOW(step1[20] - step1[21]);
    911   step2[22] = WRAPLOW(-step1[22] + step1[23]);
    912   step2[23] = WRAPLOW(step1[22] + step1[23]);
    913   step2[24] = WRAPLOW(step1[24] + step1[25]);
    914   step2[25] = WRAPLOW(step1[24] - step1[25]);
    915   step2[26] = WRAPLOW(-step1[26] + step1[27]);
    916   step2[27] = WRAPLOW(step1[26] + step1[27]);
    917   step2[28] = WRAPLOW(step1[28] + step1[29]);
    918   step2[29] = WRAPLOW(step1[28] - step1[29]);
    919   step2[30] = WRAPLOW(-step1[30] + step1[31]);
    920   step2[31] = WRAPLOW(step1[30] + step1[31]);
    921 
    922   // stage 3
          // 0..3 pass through; pairs (4,7), (5,6) are combined with cospi_*
          // weights; 8..15 get adjacent-pair butterflies; in 16..31 the pairs
          // (17,30), (18,29), (21,26), (22,25) are weighted and the remaining
          // entries pass through.
    923   step1[0] = step2[0];
    924   step1[1] = step2[1];
    925   step1[2] = step2[2];
    926   step1[3] = step2[3];
    927 
    928   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
    929   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
    930   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
    931   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
    932   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
    933   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
    934   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
    935   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
    936 
    937   step1[8] = WRAPLOW(step2[8] + step2[9]);
    938   step1[9] = WRAPLOW(step2[8] - step2[9]);
    939   step1[10] = WRAPLOW(-step2[10] + step2[11]);
    940   step1[11] = WRAPLOW(step2[10] + step2[11]);
    941   step1[12] = WRAPLOW(step2[12] + step2[13]);
    942   step1[13] = WRAPLOW(step2[12] - step2[13]);
    943   step1[14] = WRAPLOW(-step2[14] + step2[15]);
    944   step1[15] = WRAPLOW(step2[14] + step2[15]);
    945 
    946   step1[16] = step2[16];
    947   step1[31] = step2[31];
    948   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
    949   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
    950   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
    951   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
    952   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
    953   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
    954   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
    955   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
    956   step1[19] = step2[19];
    957   step1[20] = step2[20];
    958   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
    959   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
    960   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
    961   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
    962   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
    963   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
    964   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
    965   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
    966   step1[23] = step2[23];
    967   step1[24] = step2[24];
    968   step1[27] = step2[27];
    969   step1[28] = step2[28];
    970 
    971   // stage 4
          // (0,1) and (2,3) are combined with cospi_* weights; 4..7 get
          // adjacent-pair butterflies; 8, 11, 12, 15 pass through while (9,14),
          // (10,13) are weighted; 16..31 get butterflies within each group of four.
    972   temp1 = (step1[0] + step1[1]) * cospi_16_64;
    973   temp2 = (step1[0] - step1[1]) * cospi_16_64;
    974   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
    975   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
    976   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
    977   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
    978   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
    979   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
    980   step2[4] = WRAPLOW(step1[4] + step1[5]);
    981   step2[5] = WRAPLOW(step1[4] - step1[5]);
    982   step2[6] = WRAPLOW(-step1[6] + step1[7]);
    983   step2[7] = WRAPLOW(step1[6] + step1[7]);
    984 
    985   step2[8] = step1[8];
    986   step2[15] = step1[15];
    987   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
    988   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
    989   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
    990   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
    991   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
    992   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
    993   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
    994   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
    995   step2[11] = step1[11];
    996   step2[12] = step1[12];
    997 
    998   step2[16] = WRAPLOW(step1[16] + step1[19]);
    999   step2[17] = WRAPLOW(step1[17] + step1[18]);
   1000   step2[18] = WRAPLOW(step1[17] - step1[18]);
   1001   step2[19] = WRAPLOW(step1[16] - step1[19]);
   1002   step2[20] = WRAPLOW(-step1[20] + step1[23]);
   1003   step2[21] = WRAPLOW(-step1[21] + step1[22]);
   1004   step2[22] = WRAPLOW(step1[21] + step1[22]);
   1005   step2[23] = WRAPLOW(step1[20] + step1[23]);
   1006 
   1007   step2[24] = WRAPLOW(step1[24] + step1[27]);
   1008   step2[25] = WRAPLOW(step1[25] + step1[26]);
   1009   step2[26] = WRAPLOW(step1[25] - step1[26]);
   1010   step2[27] = WRAPLOW(step1[24] - step1[27]);
   1011   step2[28] = WRAPLOW(-step1[28] + step1[31]);
   1012   step2[29] = WRAPLOW(-step1[29] + step1[30]);
   1013   step2[30] = WRAPLOW(step1[29] + step1[30]);
   1014   step2[31] = WRAPLOW(step1[28] + step1[31]);
   1015 
   1016   // stage 5
          // 0..3 get butterflies; 4 and 7 pass through while (5,6) are scaled by
          // cospi_16_64; 8..15 get butterflies within each group of four; pairs
          // (18,29), (19,28), (20,27), (21,26) are weighted and the rest of 16..31
          // pass through.
   1017   step1[0] = WRAPLOW(step2[0] + step2[3]);
   1018   step1[1] = WRAPLOW(step2[1] + step2[2]);
   1019   step1[2] = WRAPLOW(step2[1] - step2[2]);
   1020   step1[3] = WRAPLOW(step2[0] - step2[3]);
   1021   step1[4] = step2[4];
   1022   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   1023   temp2 = (step2[5] + step2[6]) * cospi_16_64;
   1024   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
   1025   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   1026   step1[7] = step2[7];
   1027 
   1028   step1[8] = WRAPLOW(step2[8] + step2[11]);
   1029   step1[9] = WRAPLOW(step2[9] + step2[10]);
   1030   step1[10] = WRAPLOW(step2[9] - step2[10]);
   1031   step1[11] = WRAPLOW(step2[8] - step2[11]);
   1032   step1[12] = WRAPLOW(-step2[12] + step2[15]);
   1033   step1[13] = WRAPLOW(-step2[13] + step2[14]);
   1034   step1[14] = WRAPLOW(step2[13] + step2[14]);
   1035   step1[15] = WRAPLOW(step2[12] + step2[15]);
   1036 
   1037   step1[16] = step2[16];
   1038   step1[17] = step2[17];
   1039   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   1040   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
   1041   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
   1042   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   1043   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   1044   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
   1045   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
   1046   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
   1047   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   1048   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
   1049   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
   1050   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   1051   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   1052   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
   1053   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
   1054   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   1055   step1[22] = step2[22];
   1056   step1[23] = step2[23];
   1057   step1[24] = step2[24];
   1058   step1[25] = step2[25];
   1059   step1[30] = step2[30];
   1060   step1[31] = step2[31];
   1061 
   1062   // stage 6
          // 0..7 get butterflies (k with 7 - k); 8, 9, 14, 15 pass through while
          // (10,13), (11,12) are scaled by cospi_16_64; 16..23 and 24..31 get
          // butterflies within each group of eight.
   1063   step2[0] = WRAPLOW(step1[0] + step1[7]);
   1064   step2[1] = WRAPLOW(step1[1] + step1[6]);
   1065   step2[2] = WRAPLOW(step1[2] + step1[5]);
   1066   step2[3] = WRAPLOW(step1[3] + step1[4]);
   1067   step2[4] = WRAPLOW(step1[3] - step1[4]);
   1068   step2[5] = WRAPLOW(step1[2] - step1[5]);
   1069   step2[6] = WRAPLOW(step1[1] - step1[6]);
   1070   step2[7] = WRAPLOW(step1[0] - step1[7]);
   1071   step2[8] = step1[8];
   1072   step2[9] = step1[9];
   1073   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   1074   temp2 = (step1[10] + step1[13]) * cospi_16_64;
   1075   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
   1076   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   1077   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   1078   temp2 = (step1[11] + step1[12]) * cospi_16_64;
   1079   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
   1080   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   1081   step2[14] = step1[14];
   1082   step2[15] = step1[15];
   1083 
   1084   step2[16] = WRAPLOW(step1[16] + step1[23]);
   1085   step2[17] = WRAPLOW(step1[17] + step1[22]);
   1086   step2[18] = WRAPLOW(step1[18] + step1[21]);
   1087   step2[19] = WRAPLOW(step1[19] + step1[20]);
   1088   step2[20] = WRAPLOW(step1[19] - step1[20]);
   1089   step2[21] = WRAPLOW(step1[18] - step1[21]);
   1090   step2[22] = WRAPLOW(step1[17] - step1[22]);
   1091   step2[23] = WRAPLOW(step1[16] - step1[23]);
   1092 
   1093   step2[24] = WRAPLOW(-step1[24] + step1[31]);
   1094   step2[25] = WRAPLOW(-step1[25] + step1[30]);
   1095   step2[26] = WRAPLOW(-step1[26] + step1[29]);
   1096   step2[27] = WRAPLOW(-step1[27] + step1[28]);
   1097   step2[28] = WRAPLOW(step1[27] + step1[28]);
   1098   step2[29] = WRAPLOW(step1[26] + step1[29]);
   1099   step2[30] = WRAPLOW(step1[25] + step1[30]);
   1100   step2[31] = WRAPLOW(step1[24] + step1[31]);
   1101 
   1102   // stage 7
          // 0..15 get butterflies (k with 15 - k); 16..19 and 28..31 pass through;
          // pairs (20,27), (21,26), (22,25), (23,24) are scaled by cospi_16_64.
   1103   step1[0] = WRAPLOW(step2[0] + step2[15]);
   1104   step1[1] = WRAPLOW(step2[1] + step2[14]);
   1105   step1[2] = WRAPLOW(step2[2] + step2[13]);
   1106   step1[3] = WRAPLOW(step2[3] + step2[12]);
   1107   step1[4] = WRAPLOW(step2[4] + step2[11]);
   1108   step1[5] = WRAPLOW(step2[5] + step2[10]);
   1109   step1[6] = WRAPLOW(step2[6] + step2[9]);
   1110   step1[7] = WRAPLOW(step2[7] + step2[8]);
   1111   step1[8] = WRAPLOW(step2[7] - step2[8]);
   1112   step1[9] = WRAPLOW(step2[6] - step2[9]);
   1113   step1[10] = WRAPLOW(step2[5] - step2[10]);
   1114   step1[11] = WRAPLOW(step2[4] - step2[11]);
   1115   step1[12] = WRAPLOW(step2[3] - step2[12]);
   1116   step1[13] = WRAPLOW(step2[2] - step2[13]);
   1117   step1[14] = WRAPLOW(step2[1] - step2[14]);
   1118   step1[15] = WRAPLOW(step2[0] - step2[15]);
   1119 
   1120   step1[16] = step2[16];
   1121   step1[17] = step2[17];
   1122   step1[18] = step2[18];
   1123   step1[19] = step2[19];
   1124   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   1125   temp2 = (step2[20] + step2[27]) * cospi_16_64;
   1126   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
   1127   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   1128   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   1129   temp2 = (step2[21] + step2[26]) * cospi_16_64;
   1130   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
   1131   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   1132   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   1133   temp2 = (step2[22] + step2[25]) * cospi_16_64;
   1134   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
   1135   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   1136   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   1137   temp2 = (step2[23] + step2[24]) * cospi_16_64;
   1138   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
   1139   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
   1140   step1[28] = step2[28];
   1141   step1[29] = step2[29];
   1142   step1[30] = step2[30];
   1143   step1[31] = step2[31];
   1144 
   1145   // final stage
          // output[k] = step1[k] + step1[31 - k] and
          // output[31 - k] = step1[k] - step1[31 - k] for k = 0..15.
   1146   output[0] = WRAPLOW(step1[0] + step1[31]);
   1147   output[1] = WRAPLOW(step1[1] + step1[30]);
   1148   output[2] = WRAPLOW(step1[2] + step1[29]);
   1149   output[3] = WRAPLOW(step1[3] + step1[28]);
   1150   output[4] = WRAPLOW(step1[4] + step1[27]);
   1151   output[5] = WRAPLOW(step1[5] + step1[26]);
   1152   output[6] = WRAPLOW(step1[6] + step1[25]);
   1153   output[7] = WRAPLOW(step1[7] + step1[24]);
   1154   output[8] = WRAPLOW(step1[8] + step1[23]);
   1155   output[9] = WRAPLOW(step1[9] + step1[22]);
   1156   output[10] = WRAPLOW(step1[10] + step1[21]);
   1157   output[11] = WRAPLOW(step1[11] + step1[20]);
   1158   output[12] = WRAPLOW(step1[12] + step1[19]);
   1159   output[13] = WRAPLOW(step1[13] + step1[18]);
   1160   output[14] = WRAPLOW(step1[14] + step1[17]);
   1161   output[15] = WRAPLOW(step1[15] + step1[16]);
   1162   output[16] = WRAPLOW(step1[15] - step1[16]);
   1163   output[17] = WRAPLOW(step1[14] - step1[17]);
   1164   output[18] = WRAPLOW(step1[13] - step1[18]);
   1165   output[19] = WRAPLOW(step1[12] - step1[19]);
   1166   output[20] = WRAPLOW(step1[11] - step1[20]);
   1167   output[21] = WRAPLOW(step1[10] - step1[21]);
   1168   output[22] = WRAPLOW(step1[9] - step1[22]);
   1169   output[23] = WRAPLOW(step1[8] - step1[23]);
   1170   output[24] = WRAPLOW(step1[7] - step1[24]);
   1171   output[25] = WRAPLOW(step1[6] - step1[25]);
   1172   output[26] = WRAPLOW(step1[5] - step1[26]);
   1173   output[27] = WRAPLOW(step1[4] - step1[27]);
   1174   output[28] = WRAPLOW(step1[3] - step1[28]);
   1175   output[29] = WRAPLOW(step1[2] - step1[29]);
   1176   output[30] = WRAPLOW(step1[1] - step1[30]);
   1177   output[31] = WRAPLOW(step1[0] - step1[31]);
   1178 }
   1179 
   1180 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
   1181                               int stride) {
   1182   int i, j;
   1183   tran_low_t out[32 * 32];
   1184   tran_low_t *outptr = out;
   1185   tran_low_t temp_in[32], temp_out[32];
   1186 
   1187   // Rows
   1188   for (i = 0; i < 32; ++i) {
   1189     int16_t zero_coeff = 0;
   1190     for (j = 0; j < 32; ++j) zero_coeff |= input[j];
   1191 
   1192     if (zero_coeff)
   1193       idct32_c(input, outptr);
   1194     else
   1195       memset(outptr, 0, sizeof(tran_low_t) * 32);
   1196     input += 32;
   1197     outptr += 32;
   1198   }
   1199 
   1200   // Columns
   1201   for (i = 0; i < 32; ++i) {
   1202     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
   1203     idct32_c(temp_in, temp_out);
   1204     for (j = 0; j < 32; ++j) {
   1205       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
   1206                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
   1207     }
   1208   }
   1209 }
   1210 
   1211 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
   1212                              int stride) {
   1213   int i, j;
   1214   tran_low_t out[32 * 32] = { 0 };
   1215   tran_low_t *outptr = out;
   1216   tran_low_t temp_in[32], temp_out[32];
   1217 
   1218   // Rows
   1219   // Only upper-left 16x16 has non-zero coeff
   1220   for (i = 0; i < 16; ++i) {
   1221     idct32_c(input, outptr);
   1222     input += 32;
   1223     outptr += 32;
   1224   }
   1225 
   1226   // Columns
   1227   for (i = 0; i < 32; ++i) {
   1228     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
   1229     idct32_c(temp_in, temp_out);
   1230     for (j = 0; j < 32; ++j) {
   1231       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
   1232                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
   1233     }
   1234   }
   1235 }
   1236 
   1237 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
   1238                             int stride) {
   1239   int i, j;
   1240   tran_low_t out[32 * 32] = { 0 };
   1241   tran_low_t *outptr = out;
   1242   tran_low_t temp_in[32], temp_out[32];
   1243 
   1244   // Rows
   1245   // Only upper-left 8x8 has non-zero coeff
   1246   for (i = 0; i < 8; ++i) {
   1247     idct32_c(input, outptr);
   1248     input += 32;
   1249     outptr += 32;
   1250   }
   1251 
   1252   // Columns
   1253   for (i = 0; i < 32; ++i) {
   1254     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
   1255     idct32_c(temp_in, temp_out);
   1256     for (j = 0; j < 32; ++j) {
   1257       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
   1258                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
   1259     }
   1260   }
   1261 }
   1262 
   1263 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   1264   int i, j;
   1265   tran_high_t a1;
   1266   tran_low_t out =
   1267       WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
   1268 
   1269   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   1270   a1 = ROUND_POWER_OF_TWO(out, 6);
   1271 
   1272   for (j = 0; j < 32; ++j) {
   1273     for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
   1274     dest += stride;
   1275   }
   1276 }
   1277 
   1278 #if CONFIG_VP9_HIGHBITDEPTH
   1279 
   1280 // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
   1281 // transform amplify bits + 1 bit for contingency in rounding and quantizing
   1282 #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
   1283 
   1284 static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
   1285                                               int size) {
   1286   int i;
   1287   for (i = 0; i < size; ++i)
   1288     if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
   1289   return 0;
   1290 }
   1291 
   1292 void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
   1293                                  int stride, int bd) {
   1294   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
   1295      0.5 shifts per pixel. */
   1296   int i;
   1297   tran_low_t output[16];
   1298   tran_high_t a1, b1, c1, d1, e1;
   1299   const tran_low_t *ip = input;
   1300   tran_low_t *op = output;
   1301 
   1302   for (i = 0; i < 4; i++) {
   1303     a1 = ip[0] >> UNIT_QUANT_SHIFT;
   1304     c1 = ip[1] >> UNIT_QUANT_SHIFT;
   1305     d1 = ip[2] >> UNIT_QUANT_SHIFT;
   1306     b1 = ip[3] >> UNIT_QUANT_SHIFT;
   1307     a1 += c1;
   1308     d1 -= b1;
   1309     e1 = (a1 - d1) >> 1;
   1310     b1 = e1 - b1;
   1311     c1 = e1 - c1;
   1312     a1 -= b1;
   1313     d1 += c1;
   1314     op[0] = HIGHBD_WRAPLOW(a1, bd);
   1315     op[1] = HIGHBD_WRAPLOW(b1, bd);
   1316     op[2] = HIGHBD_WRAPLOW(c1, bd);
   1317     op[3] = HIGHBD_WRAPLOW(d1, bd);
   1318     ip += 4;
   1319     op += 4;
   1320   }
   1321 
   1322   ip = output;
   1323   for (i = 0; i < 4; i++) {
   1324     a1 = ip[4 * 0];
   1325     c1 = ip[4 * 1];
   1326     d1 = ip[4 * 2];
   1327     b1 = ip[4 * 3];
   1328     a1 += c1;
   1329     d1 -= b1;
   1330     e1 = (a1 - d1) >> 1;
   1331     b1 = e1 - b1;
   1332     c1 = e1 - c1;
   1333     a1 -= b1;
   1334     d1 += c1;
   1335     dest[stride * 0] =
   1336         highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
   1337     dest[stride * 1] =
   1338         highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
   1339     dest[stride * 2] =
   1340         highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
   1341     dest[stride * 3] =
   1342         highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
   1343 
   1344     ip++;
   1345     dest++;
   1346   }
   1347 }
   1348 
   1349 void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,
   1350                                 int stride, int bd) {
   1351   int i;
   1352   tran_high_t a1, e1;
   1353   tran_low_t tmp[4];
   1354   const tran_low_t *ip = in;
   1355   tran_low_t *op = tmp;
   1356   (void)bd;
   1357 
   1358   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   1359   e1 = a1 >> 1;
   1360   a1 -= e1;
   1361   op[0] = HIGHBD_WRAPLOW(a1, bd);
   1362   op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
   1363 
   1364   ip = tmp;
   1365   for (i = 0; i < 4; i++) {
   1366     e1 = ip[0] >> 1;
   1367     a1 = ip[0] - e1;
   1368     dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
   1369     dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
   1370     dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
   1371     dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
   1372     ip++;
   1373     dest++;
   1374   }
   1375 }
   1376 
   1377 void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1378   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
   1379   tran_low_t x0 = input[0];
   1380   tran_low_t x1 = input[1];
   1381   tran_low_t x2 = input[2];
   1382   tran_low_t x3 = input[3];
   1383   (void)bd;
   1384 
   1385   if (detect_invalid_highbd_input(input, 4)) {
   1386 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   1387     assert(0 && "invalid highbd txfm input");
   1388 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   1389     memset(output, 0, sizeof(*output) * 4);
   1390     return;
   1391   }
   1392 
   1393   if (!(x0 | x1 | x2 | x3)) {
   1394     memset(output, 0, 4 * sizeof(*output));
   1395     return;
   1396   }
   1397 
   1398   s0 = (tran_high_t)sinpi_1_9 * x0;
   1399   s1 = (tran_high_t)sinpi_2_9 * x0;
   1400   s2 = (tran_high_t)sinpi_3_9 * x1;
   1401   s3 = (tran_high_t)sinpi_4_9 * x2;
   1402   s4 = (tran_high_t)sinpi_1_9 * x2;
   1403   s5 = (tran_high_t)sinpi_2_9 * x3;
   1404   s6 = (tran_high_t)sinpi_4_9 * x3;
   1405   s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
   1406 
   1407   s0 = s0 + s3 + s5;
   1408   s1 = s1 - s4 - s6;
   1409   s3 = s2;
   1410   s2 = sinpi_3_9 * s7;
   1411 
   1412   // 1-D transform scaling factor is sqrt(2).
   1413   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   1414   // + 1b (addition) = 29b.
   1415   // Hence the output bit depth is 15b.
   1416   output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
   1417   output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
   1418   output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
   1419   output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
   1420 }
   1421 
   1422 void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1423   tran_low_t step[4];
   1424   tran_high_t temp1, temp2;
   1425   (void)bd;
   1426 
   1427   if (detect_invalid_highbd_input(input, 4)) {
   1428 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   1429     assert(0 && "invalid highbd txfm input");
   1430 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   1431     memset(output, 0, sizeof(*output) * 4);
   1432     return;
   1433   }
   1434 
   1435   // stage 1
   1436   temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64;
   1437   temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64;
   1438   step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1439   step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1440   temp1 =
   1441       input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64;
   1442   temp2 =
   1443       input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64;
   1444   step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1445   step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1446 
   1447   // stage 2
   1448   output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
   1449   output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
   1450   output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
   1451   output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
   1452 }
   1453 
   1454 void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
   1455                                  int stride, int bd) {
   1456   int i, j;
   1457   tran_low_t out[4 * 4];
   1458   tran_low_t *outptr = out;
   1459   tran_low_t temp_in[4], temp_out[4];
   1460 
   1461   // Rows
   1462   for (i = 0; i < 4; ++i) {
   1463     vpx_highbd_idct4_c(input, outptr, bd);
   1464     input += 4;
   1465     outptr += 4;
   1466   }
   1467 
   1468   // Columns
   1469   for (i = 0; i < 4; ++i) {
   1470     for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
   1471     vpx_highbd_idct4_c(temp_in, temp_out, bd);
   1472     for (j = 0; j < 4; ++j) {
   1473       dest[j * stride + i] = highbd_clip_pixel_add(
   1474           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
   1475     }
   1476   }
   1477 }
   1478 
   1479 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
   1480                                 int stride, int bd) {
   1481   int i;
   1482   tran_high_t a1;
   1483   tran_low_t out = HIGHBD_WRAPLOW(
   1484       dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
   1485 
   1486   out =
   1487       HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
   1488   a1 = ROUND_POWER_OF_TWO(out, 4);
   1489 
   1490   for (i = 0; i < 4; i++) {
   1491     dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
   1492     dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
   1493     dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
   1494     dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
   1495     dest += stride;
   1496   }
   1497 }
   1498 
   1499 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1500   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
   1501   tran_low_t x0 = input[7];
   1502   tran_low_t x1 = input[0];
   1503   tran_low_t x2 = input[5];
   1504   tran_low_t x3 = input[2];
   1505   tran_low_t x4 = input[3];
   1506   tran_low_t x5 = input[4];
   1507   tran_low_t x6 = input[1];
   1508   tran_low_t x7 = input[6];
   1509   (void)bd;
   1510 
   1511   if (detect_invalid_highbd_input(input, 8)) {
   1512 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   1513     assert(0 && "invalid highbd txfm input");
   1514 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   1515     memset(output, 0, sizeof(*output) * 8);
   1516     return;
   1517   }
   1518 
   1519   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
   1520     memset(output, 0, 8 * sizeof(*output));
   1521     return;
   1522   }
   1523 
   1524   // stage 1
   1525   s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1;
   1526   s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1;
   1527   s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3;
   1528   s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3;
   1529   s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5;
   1530   s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5;
   1531   s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7;
   1532   s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7;
   1533 
   1534   x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
   1535   x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
   1536   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
   1537   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
   1538   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
   1539   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
   1540   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
   1541   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
   1542 
   1543   // stage 2
   1544   s0 = x0;
   1545   s1 = x1;
   1546   s2 = x2;
   1547   s3 = x3;
   1548   s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5;
   1549   s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5;
   1550   s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7;
   1551   s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7;
   1552 
   1553   x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
   1554   x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
   1555   x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
   1556   x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
   1557   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
   1558   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
   1559   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
   1560   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
   1561 
   1562   // stage 3
   1563   s2 = (tran_high_t)cospi_16_64 * (x2 + x3);
   1564   s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
   1565   s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
   1566   s7 = (tran_high_t)cospi_16_64 * (x6 - x7);
   1567 
   1568   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
   1569   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
   1570   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
   1571   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
   1572 
   1573   output[0] = HIGHBD_WRAPLOW(x0, bd);
   1574   output[1] = HIGHBD_WRAPLOW(-x4, bd);
   1575   output[2] = HIGHBD_WRAPLOW(x6, bd);
   1576   output[3] = HIGHBD_WRAPLOW(-x2, bd);
   1577   output[4] = HIGHBD_WRAPLOW(x3, bd);
   1578   output[5] = HIGHBD_WRAPLOW(-x7, bd);
   1579   output[6] = HIGHBD_WRAPLOW(x5, bd);
   1580   output[7] = HIGHBD_WRAPLOW(-x1, bd);
   1581 }
   1582 
   1583 void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1584   tran_low_t step1[8], step2[8];
   1585   tran_high_t temp1, temp2;
   1586 
   1587   if (detect_invalid_highbd_input(input, 8)) {
   1588 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   1589     assert(0 && "invalid highbd txfm input");
   1590 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   1591     memset(output, 0, sizeof(*output) * 8);
   1592     return;
   1593   }
   1594 
   1595   // stage 1
   1596   step1[0] = input[0];
   1597   step1[2] = input[4];
   1598   step1[1] = input[2];
   1599   step1[3] = input[6];
   1600   temp1 =
   1601       input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64;
   1602   temp2 =
   1603       input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64;
   1604   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1605   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1606   temp1 =
   1607       input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64;
   1608   temp2 =
   1609       input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64;
   1610   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1611   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1612 
   1613   // stage 2 & stage 3 - even half
   1614   vpx_highbd_idct4_c(step1, step1, bd);
   1615 
   1616   // stage 2 - odd half
   1617   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
   1618   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
   1619   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
   1620   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
   1621 
   1622   // stage 3 - odd half
   1623   step1[4] = step2[4];
   1624   temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
   1625   temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
   1626   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1627   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1628   step1[7] = step2[7];
   1629 
   1630   // stage 4
   1631   output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
   1632   output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
   1633   output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
   1634   output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
   1635   output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
   1636   output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
   1637   output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
   1638   output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   1639 }
   1640 
   1641 void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
   1642                                  int stride, int bd) {
   1643   int i, j;
   1644   tran_low_t out[8 * 8];
   1645   tran_low_t *outptr = out;
   1646   tran_low_t temp_in[8], temp_out[8];
   1647 
   1648   // First transform rows
   1649   for (i = 0; i < 8; ++i) {
   1650     vpx_highbd_idct8_c(input, outptr, bd);
   1651     input += 8;
   1652     outptr += 8;
   1653   }
   1654 
   1655   // Then transform columns
   1656   for (i = 0; i < 8; ++i) {
   1657     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
   1658     vpx_highbd_idct8_c(temp_in, temp_out, bd);
   1659     for (j = 0; j < 8; ++j) {
   1660       dest[j * stride + i] = highbd_clip_pixel_add(
   1661           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
   1662     }
   1663   }
   1664 }
   1665 
   1666 void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
   1667                                  int stride, int bd) {
   1668   int i, j;
   1669   tran_low_t out[8 * 8] = { 0 };
   1670   tran_low_t *outptr = out;
   1671   tran_low_t temp_in[8], temp_out[8];
   1672 
   1673   // First transform rows
   1674   // Only first 4 row has non-zero coefs
   1675   for (i = 0; i < 4; ++i) {
   1676     vpx_highbd_idct8_c(input, outptr, bd);
   1677     input += 8;
   1678     outptr += 8;
   1679   }
   1680 
   1681   // Then transform columns
   1682   for (i = 0; i < 8; ++i) {
   1683     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
   1684     vpx_highbd_idct8_c(temp_in, temp_out, bd);
   1685     for (j = 0; j < 8; ++j) {
   1686       dest[j * stride + i] = highbd_clip_pixel_add(
   1687           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
   1688     }
   1689   }
   1690 }
   1691 
   1692 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
   1693                                 int stride, int bd) {
   1694   int i, j;
   1695   tran_high_t a1;
   1696   tran_low_t out = HIGHBD_WRAPLOW(
   1697       dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
   1698 
   1699   out =
   1700       HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
   1701   a1 = ROUND_POWER_OF_TWO(out, 5);
   1702   for (j = 0; j < 8; ++j) {
   1703     for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
   1704     dest += stride;
   1705   }
   1706 }
   1707 
   1708 void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1709   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
   1710   tran_high_t s9, s10, s11, s12, s13, s14, s15;
   1711   tran_low_t x0 = input[15];
   1712   tran_low_t x1 = input[0];
   1713   tran_low_t x2 = input[13];
   1714   tran_low_t x3 = input[2];
   1715   tran_low_t x4 = input[11];
   1716   tran_low_t x5 = input[4];
   1717   tran_low_t x6 = input[9];
   1718   tran_low_t x7 = input[6];
   1719   tran_low_t x8 = input[7];
   1720   tran_low_t x9 = input[8];
   1721   tran_low_t x10 = input[5];
   1722   tran_low_t x11 = input[10];
   1723   tran_low_t x12 = input[3];
   1724   tran_low_t x13 = input[12];
   1725   tran_low_t x14 = input[1];
   1726   tran_low_t x15 = input[14];
   1727   (void)bd;
   1728 
   1729   if (detect_invalid_highbd_input(input, 16)) {
   1730 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   1731     assert(0 && "invalid highbd txfm input");
   1732 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   1733     memset(output, 0, sizeof(*output) * 16);
   1734     return;
   1735   }
   1736 
   1737   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
   1738         x13 | x14 | x15)) {
   1739     memset(output, 0, 16 * sizeof(*output));
   1740     return;
   1741   }
   1742 
   1743   // stage 1
   1744   s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64;
   1745   s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64;
   1746   s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64;
   1747   s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64;
   1748   s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64;
   1749   s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64;
   1750   s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64;
   1751   s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64;
   1752   s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64;
   1753   s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64;
   1754   s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64;
   1755   s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64;
   1756   s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64;
   1757   s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64;
   1758   s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64;
   1759   s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64;
   1760 
   1761   x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
   1762   x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
   1763   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
   1764   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
   1765   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
   1766   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
   1767   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
   1768   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
   1769   x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
   1770   x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
   1771   x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
   1772   x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
   1773   x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
   1774   x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
   1775   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
   1776   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
   1777 
   1778   // stage 2
   1779   s0 = x0;
   1780   s1 = x1;
   1781   s2 = x2;
   1782   s3 = x3;
   1783   s4 = x4;
   1784   s5 = x5;
   1785   s6 = x6;
   1786   s7 = x7;
   1787   s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64;
   1788   s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64;
   1789   s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64;
   1790   s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64;
   1791   s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64;
   1792   s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64;
   1793   s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64;
   1794   s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64;
   1795 
   1796   x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
   1797   x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
   1798   x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
   1799   x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
   1800   x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
   1801   x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
   1802   x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
   1803   x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
   1804   x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
   1805   x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
   1806   x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
   1807   x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
   1808   x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
   1809   x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
   1810   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
   1811   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
   1812 
   1813   // stage 3
   1814   s0 = x0;
   1815   s1 = x1;
   1816   s2 = x2;
   1817   s3 = x3;
   1818   s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64;
   1819   s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64;
   1820   s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64;
   1821   s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64;
   1822   s8 = x8;
   1823   s9 = x9;
   1824   s10 = x10;
   1825   s11 = x11;
   1826   s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64;
   1827   s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64;
   1828   s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64;
   1829   s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64;
   1830 
   1831   x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
   1832   x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
   1833   x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
   1834   x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
   1835   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
   1836   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
   1837   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
   1838   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
   1839   x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
   1840   x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
   1841   x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
   1842   x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
   1843   x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
   1844   x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
   1845   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
   1846   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
   1847 
   1848   // stage 4
   1849   s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3);
   1850   s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
   1851   s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
   1852   s7 = (tran_high_t)cospi_16_64 * (-x6 + x7);
   1853   s10 = (tran_high_t)cospi_16_64 * (x10 + x11);
   1854   s11 = (tran_high_t)cospi_16_64 * (-x10 + x11);
   1855   s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15);
   1856   s15 = (tran_high_t)cospi_16_64 * (x14 - x15);
   1857 
   1858   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
   1859   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
   1860   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
   1861   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
   1862   x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
   1863   x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
   1864   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
   1865   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
   1866 
   1867   output[0] = HIGHBD_WRAPLOW(x0, bd);
   1868   output[1] = HIGHBD_WRAPLOW(-x8, bd);
   1869   output[2] = HIGHBD_WRAPLOW(x12, bd);
   1870   output[3] = HIGHBD_WRAPLOW(-x4, bd);
   1871   output[4] = HIGHBD_WRAPLOW(x6, bd);
   1872   output[5] = HIGHBD_WRAPLOW(x14, bd);
   1873   output[6] = HIGHBD_WRAPLOW(x10, bd);
   1874   output[7] = HIGHBD_WRAPLOW(x2, bd);
   1875   output[8] = HIGHBD_WRAPLOW(x3, bd);
   1876   output[9] = HIGHBD_WRAPLOW(x11, bd);
   1877   output[10] = HIGHBD_WRAPLOW(x15, bd);
   1878   output[11] = HIGHBD_WRAPLOW(x7, bd);
   1879   output[12] = HIGHBD_WRAPLOW(x5, bd);
   1880   output[13] = HIGHBD_WRAPLOW(-x13, bd);
   1881   output[14] = HIGHBD_WRAPLOW(x9, bd);
   1882   output[15] = HIGHBD_WRAPLOW(-x1, bd);
   1883 }
   1884 
   1885 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   1886   tran_low_t step1[16], step2[16];
   1887   tran_high_t temp1, temp2;
   1888   (void)bd;
   1889 
   1890   if (detect_invalid_highbd_input(input, 16)) {
   1891 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   1892     assert(0 && "invalid highbd txfm input");
   1893 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   1894     memset(output, 0, sizeof(*output) * 16);
   1895     return;
   1896   }
   1897 
   1898   // stage 1
   1899   step1[0] = input[0 / 2];
   1900   step1[1] = input[16 / 2];
   1901   step1[2] = input[8 / 2];
   1902   step1[3] = input[24 / 2];
   1903   step1[4] = input[4 / 2];
   1904   step1[5] = input[20 / 2];
   1905   step1[6] = input[12 / 2];
   1906   step1[7] = input[28 / 2];
   1907   step1[8] = input[2 / 2];
   1908   step1[9] = input[18 / 2];
   1909   step1[10] = input[10 / 2];
   1910   step1[11] = input[26 / 2];
   1911   step1[12] = input[6 / 2];
   1912   step1[13] = input[22 / 2];
   1913   step1[14] = input[14 / 2];
   1914   step1[15] = input[30 / 2];
   1915 
   1916   // stage 2
   1917   step2[0] = step1[0];
   1918   step2[1] = step1[1];
   1919   step2[2] = step1[2];
   1920   step2[3] = step1[3];
   1921   step2[4] = step1[4];
   1922   step2[5] = step1[5];
   1923   step2[6] = step1[6];
   1924   step2[7] = step1[7];
   1925 
   1926   temp1 =
   1927       step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
   1928   temp2 =
   1929       step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
   1930   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1931   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1932 
   1933   temp1 = step1[9] * (tran_high_t)cospi_14_64 -
   1934           step1[14] * (tran_high_t)cospi_18_64;
   1935   temp2 = step1[9] * (tran_high_t)cospi_18_64 +
   1936           step1[14] * (tran_high_t)cospi_14_64;
   1937   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1938   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1939 
   1940   temp1 = step1[10] * (tran_high_t)cospi_22_64 -
   1941           step1[13] * (tran_high_t)cospi_10_64;
   1942   temp2 = step1[10] * (tran_high_t)cospi_10_64 +
   1943           step1[13] * (tran_high_t)cospi_22_64;
   1944   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1945   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1946 
   1947   temp1 = step1[11] * (tran_high_t)cospi_6_64 -
   1948           step1[12] * (tran_high_t)cospi_26_64;
   1949   temp2 = step1[11] * (tran_high_t)cospi_26_64 +
   1950           step1[12] * (tran_high_t)cospi_6_64;
   1951   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1952   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1953 
   1954   // stage 3
   1955   step1[0] = step2[0];
   1956   step1[1] = step2[1];
   1957   step1[2] = step2[2];
   1958   step1[3] = step2[3];
   1959 
   1960   temp1 =
   1961       step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
   1962   temp2 =
   1963       step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
   1964   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1965   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1966   temp1 =
   1967       step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
   1968   temp2 =
   1969       step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
   1970   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1971   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1972 
   1973   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
   1974   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
   1975   step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
   1976   step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
   1977   step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
   1978   step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
   1979   step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
   1980   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
   1981 
   1982   // stage 4
   1983   temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
   1984   temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
   1985   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1986   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1987   temp1 =
   1988       step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
   1989   temp2 =
   1990       step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
   1991   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   1992   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   1993   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
   1994   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
   1995   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
   1996   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
   1997 
   1998   step2[8] = step1[8];
   1999   step2[15] = step1[15];
   2000   temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
   2001           step1[14] * (tran_high_t)cospi_24_64;
   2002   temp2 =
   2003       step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
   2004   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2005   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2006   temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
   2007           step1[13] * (tran_high_t)cospi_8_64;
   2008   temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
   2009           step1[13] * (tran_high_t)cospi_24_64;
   2010   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2011   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2012   step2[11] = step1[11];
   2013   step2[12] = step1[12];
   2014 
   2015   // stage 5
   2016   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
   2017   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
   2018   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
   2019   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   2020   step1[4] = step2[4];
   2021   temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
   2022   temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
   2023   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2024   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2025   step1[7] = step2[7];
   2026 
   2027   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
   2028   step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
   2029   step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
   2030   step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
   2031   step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
   2032   step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
   2033   step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
   2034   step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
   2035 
   2036   // stage 6
   2037   step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
   2038   step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
   2039   step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
   2040   step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
   2041   step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
   2042   step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
   2043   step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
   2044   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   2045   step2[8] = step1[8];
   2046   step2[9] = step1[9];
   2047   temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
   2048   temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
   2049   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2050   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2051   temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
   2052   temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
   2053   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2054   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2055   step2[14] = step1[14];
   2056   step2[15] = step1[15];
   2057 
   2058   // stage 7
   2059   output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
   2060   output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
   2061   output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
   2062   output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
   2063   output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
   2064   output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
   2065   output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
   2066   output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
   2067   output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
   2068   output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
   2069   output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
   2070   output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
   2071   output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
   2072   output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
   2073   output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
   2074   output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
   2075 }
   2076 
   2077 void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
   2078                                     int stride, int bd) {
   2079   int i, j;
   2080   tran_low_t out[16 * 16];
   2081   tran_low_t *outptr = out;
   2082   tran_low_t temp_in[16], temp_out[16];
   2083 
   2084   // First transform rows
   2085   for (i = 0; i < 16; ++i) {
   2086     vpx_highbd_idct16_c(input, outptr, bd);
   2087     input += 16;
   2088     outptr += 16;
   2089   }
   2090 
   2091   // Then transform columns
   2092   for (i = 0; i < 16; ++i) {
   2093     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
   2094     vpx_highbd_idct16_c(temp_in, temp_out, bd);
   2095     for (j = 0; j < 16; ++j) {
   2096       dest[j * stride + i] = highbd_clip_pixel_add(
   2097           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2098     }
   2099   }
   2100 }
   2101 
   2102 void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
   2103                                    int stride, int bd) {
   2104   int i, j;
   2105   tran_low_t out[16 * 16] = { 0 };
   2106   tran_low_t *outptr = out;
   2107   tran_low_t temp_in[16], temp_out[16];
   2108 
   2109   // First transform rows. Since all non-zero dct coefficients are in
   2110   // upper-left 8x8 area, we only need to calculate first 8 rows here.
   2111   for (i = 0; i < 8; ++i) {
   2112     vpx_highbd_idct16_c(input, outptr, bd);
   2113     input += 16;
   2114     outptr += 16;
   2115   }
   2116 
   2117   // Then transform columns
   2118   for (i = 0; i < 16; ++i) {
   2119     uint16_t *destT = dest;
   2120     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
   2121     vpx_highbd_idct16_c(temp_in, temp_out, bd);
   2122     for (j = 0; j < 16; ++j) {
   2123       destT[i] = highbd_clip_pixel_add(destT[i],
   2124                                        ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2125       destT += stride;
   2126     }
   2127   }
   2128 }
   2129 
   2130 void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
   2131                                    int stride, int bd) {
   2132   int i, j;
   2133   tran_low_t out[16 * 16] = { 0 };
   2134   tran_low_t *outptr = out;
   2135   tran_low_t temp_in[16], temp_out[16];
   2136 
   2137   // First transform rows. Since all non-zero dct coefficients are in
   2138   // upper-left 4x4 area, we only need to calculate first 4 rows here.
   2139   for (i = 0; i < 4; ++i) {
   2140     vpx_highbd_idct16_c(input, outptr, bd);
   2141     input += 16;
   2142     outptr += 16;
   2143   }
   2144 
   2145   // Then transform columns
   2146   for (i = 0; i < 16; ++i) {
   2147     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
   2148     vpx_highbd_idct16_c(temp_in, temp_out, bd);
   2149     for (j = 0; j < 16; ++j) {
   2150       dest[j * stride + i] = highbd_clip_pixel_add(
   2151           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
   2152     }
   2153   }
   2154 }
   2155 
   2156 void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
   2157                                   int stride, int bd) {
   2158   int i, j;
   2159   tran_high_t a1;
   2160   tran_low_t out = HIGHBD_WRAPLOW(
   2161       dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
   2162 
   2163   out =
   2164       HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
   2165   a1 = ROUND_POWER_OF_TWO(out, 6);
   2166   for (j = 0; j < 16; ++j) {
   2167     for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
   2168     dest += stride;
   2169   }
   2170 }
   2171 
   2172 static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   2173                             int bd) {  // 32-point 1-D pass: reads input[0..31], writes output[0..31]
   2174   tran_low_t step1[32], step2[32];  // successive stages alternate between these two buffers
   2175   tran_high_t temp1, temp2;  // hold the tran_high_t products before dct_const_round_shift()
   2176   (void)bd;  // NOTE(review): presumably keeps bd "used" if HIGHBD_WRAPLOW ignores it - confirm
   2177   // Inputs flagged by detect_invalid_highbd_input() produce an all-zero output and an early return.
   2178   if (detect_invalid_highbd_input(input, 32)) {
   2179 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   2180     assert(0 && "invalid highbd txfm input");
   2181 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   2182     memset(output, 0, sizeof(*output) * 32);
   2183     return;
   2184   }
   2185 
   2186   // stage 1: even-indexed inputs are reordered into step1[0..15].
   2187   step1[0] = input[0];
   2188   step1[1] = input[16];
   2189   step1[2] = input[8];
   2190   step1[3] = input[24];
   2191   step1[4] = input[4];
   2192   step1[5] = input[20];
   2193   step1[6] = input[12];
   2194   step1[7] = input[28];
   2195   step1[8] = input[2];
   2196   step1[9] = input[18];
   2197   step1[10] = input[10];
   2198   step1[11] = input[26];
   2199   step1[12] = input[6];
   2200   step1[13] = input[22];
   2201   step1[14] = input[14];
   2202   step1[15] = input[30];
   2203   // stage 1, odd-indexed inputs: each pair is mixed with cospi weights into step1[16..31].
   2204   temp1 =
   2205       input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64;
   2206   temp2 =
   2207       input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64;
   2208   step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2209   step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2210 
   2211   temp1 = input[17] * (tran_high_t)cospi_15_64 -
   2212           input[15] * (tran_high_t)cospi_17_64;
   2213   temp2 = input[17] * (tran_high_t)cospi_17_64 +
   2214           input[15] * (tran_high_t)cospi_15_64;
   2215   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2216   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2217 
   2218   temp1 =
   2219       input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64;
   2220   temp2 =
   2221       input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64;
   2222   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2223   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2224 
   2225   temp1 =
   2226       input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64;
   2227   temp2 =
   2228       input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64;
   2229   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2230   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2231 
   2232   temp1 =
   2233       input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64;
   2234   temp2 =
   2235       input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64;
   2236   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2237   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2238 
   2239   temp1 = input[21] * (tran_high_t)cospi_11_64 -
   2240           input[11] * (tran_high_t)cospi_21_64;
   2241   temp2 = input[21] * (tran_high_t)cospi_21_64 +
   2242           input[11] * (tran_high_t)cospi_11_64;
   2243   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2244   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2245 
   2246   temp1 = input[13] * (tran_high_t)cospi_19_64 -
   2247           input[19] * (tran_high_t)cospi_13_64;
   2248   temp2 = input[13] * (tran_high_t)cospi_13_64 +
   2249           input[19] * (tran_high_t)cospi_19_64;
   2250   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2251   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2252 
   2253   temp1 =
   2254       input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64;
   2255   temp2 =
   2256       input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64;
   2257   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2258   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2259 
   2260   // stage 2: step1[0..7] pass through unchanged.
   2261   step2[0] = step1[0];
   2262   step2[1] = step1[1];
   2263   step2[2] = step1[2];
   2264   step2[3] = step1[3];
   2265   step2[4] = step1[4];
   2266   step2[5] = step1[5];
   2267   step2[6] = step1[6];
   2268   step2[7] = step1[7];
   2269   // step1[8..15]: pairs (8,15), (9,14), (10,13), (11,12) are mixed with cospi weights.
   2270   temp1 =
   2271       step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
   2272   temp2 =
   2273       step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
   2274   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2275   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2276 
   2277   temp1 = step1[9] * (tran_high_t)cospi_14_64 -
   2278           step1[14] * (tran_high_t)cospi_18_64;
   2279   temp2 = step1[9] * (tran_high_t)cospi_18_64 +
   2280           step1[14] * (tran_high_t)cospi_14_64;
   2281   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2282   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2283 
   2284   temp1 = step1[10] * (tran_high_t)cospi_22_64 -
   2285           step1[13] * (tran_high_t)cospi_10_64;
   2286   temp2 = step1[10] * (tran_high_t)cospi_10_64 +
   2287           step1[13] * (tran_high_t)cospi_22_64;
   2288   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2289   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2290 
   2291   temp1 = step1[11] * (tran_high_t)cospi_6_64 -
   2292           step1[12] * (tran_high_t)cospi_26_64;
   2293   temp2 = step1[11] * (tran_high_t)cospi_26_64 +
   2294           step1[12] * (tran_high_t)cospi_6_64;
   2295   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2296   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2297   // step1[16..31]: sum and difference of each adjacent pair (16,17), (18,19), ... (30,31).
   2298   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
   2299   step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
   2300   step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
   2301   step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
   2302   step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
   2303   step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
   2304   step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
   2305   step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
   2306   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
   2307   step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
   2308   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
   2309   step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
   2310   step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
   2311   step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
   2312   step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
   2313   step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
   2314 
   2315   // stage 3: step2[0..3] pass through; pairs (4,7) and (5,6) are mixed with cospi weights.
   2316   step1[0] = step2[0];
   2317   step1[1] = step2[1];
   2318   step1[2] = step2[2];
   2319   step1[3] = step2[3];
   2320 
   2321   temp1 =
   2322       step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
   2323   temp2 =
   2324       step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
   2325   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2326   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2327   temp1 =
   2328       step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
   2329   temp2 =
   2330       step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
   2331   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2332   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2333   // step2[8..15]: sum and difference of each adjacent pair (8,9), (10,11), (12,13), (14,15).
   2334   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
   2335   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
   2336   step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
   2337   step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
   2338   step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
   2339   step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
   2340   step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
   2341   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
   2342   // step2[16..31]: pairs (17,30), (18,29), (21,26), (22,25) are mixed; the rest are copied.
   2343   step1[16] = step2[16];
   2344   step1[31] = step2[31];
   2345   temp1 = -step2[17] * (tran_high_t)cospi_4_64 +
   2346           step2[30] * (tran_high_t)cospi_28_64;
   2347   temp2 = step2[17] * (tran_high_t)cospi_28_64 +
   2348           step2[30] * (tran_high_t)cospi_4_64;
   2349   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2350   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2351   temp1 = -step2[18] * (tran_high_t)cospi_28_64 -
   2352           step2[29] * (tran_high_t)cospi_4_64;
   2353   temp2 = -step2[18] * (tran_high_t)cospi_4_64 +
   2354           step2[29] * (tran_high_t)cospi_28_64;
   2355   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2356   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2357   step1[19] = step2[19];
   2358   step1[20] = step2[20];
   2359   temp1 = -step2[21] * (tran_high_t)cospi_20_64 +
   2360           step2[26] * (tran_high_t)cospi_12_64;
   2361   temp2 = step2[21] * (tran_high_t)cospi_12_64 +
   2362           step2[26] * (tran_high_t)cospi_20_64;
   2363   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2364   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2365   temp1 = -step2[22] * (tran_high_t)cospi_12_64 -
   2366           step2[25] * (tran_high_t)cospi_20_64;
   2367   temp2 = -step2[22] * (tran_high_t)cospi_20_64 +
   2368           step2[25] * (tran_high_t)cospi_12_64;
   2369   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2370   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2371   step1[23] = step2[23];
   2372   step1[24] = step2[24];
   2373   step1[27] = step2[27];
   2374   step1[28] = step2[28];
   2375 
   2376   // stage 4: (0,1) sum/difference scaled by cospi_16_64; (2,3) mixed; 4..7 adjacent sums/differences.
   2377   temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
   2378   temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
   2379   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2380   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2381   temp1 =
   2382       step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
   2383   temp2 =
   2384       step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
   2385   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2386   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2387   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
   2388   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
   2389   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
   2390   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
   2391   // step1[8..15]: pairs (9,14) and (10,13) are mixed; 8, 11, 12 and 15 are copied.
   2392   step2[8] = step1[8];
   2393   step2[15] = step1[15];
   2394   temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
   2395           step1[14] * (tran_high_t)cospi_24_64;
   2396   temp2 =
   2397       step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
   2398   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2399   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2400   temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
   2401           step1[13] * (tran_high_t)cospi_8_64;
   2402   temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
   2403           step1[13] * (tran_high_t)cospi_24_64;
   2404   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2405   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2406   step2[11] = step1[11];
   2407   step2[12] = step1[12];
   2408   // step1[16..23]: sums/differences of (16,19), (17,18), (20,23), (21,22).
   2409   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
   2410   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
   2411   step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
   2412   step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
   2413   step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
   2414   step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
   2415   step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
   2416   step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
   2417   // step1[24..31]: sums/differences of (24,27), (25,26), (28,31), (29,30).
   2418   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
   2419   step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
   2420   step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
   2421   step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
   2422   step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
   2423   step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
   2424   step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
   2425   step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
   2426 
   2427   // stage 5: sums/differences of (0,3), (1,2); 4 and 7 copied; (5,6) scaled by cospi_16_64.
   2428   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
   2429   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
   2430   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
   2431   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   2432   step1[4] = step2[4];
   2433   temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
   2434   temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
   2435   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2436   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2437   step1[7] = step2[7];
   2438   // step2[8..15]: sums/differences of (8,11), (9,10), (12,15), (13,14).
   2439   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
   2440   step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
   2441   step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
   2442   step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
   2443   step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
   2444   step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
   2445   step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
   2446   step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
   2447   // step2[16..31]: pairs (18,29), (19,28), (20,27), (21,26) are mixed; the rest are copied.
   2448   step1[16] = step2[16];
   2449   step1[17] = step2[17];
   2450   temp1 = -step2[18] * (tran_high_t)cospi_8_64 +
   2451           step2[29] * (tran_high_t)cospi_24_64;
   2452   temp2 = step2[18] * (tran_high_t)cospi_24_64 +
   2453           step2[29] * (tran_high_t)cospi_8_64;
   2454   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2455   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2456   temp1 = -step2[19] * (tran_high_t)cospi_8_64 +
   2457           step2[28] * (tran_high_t)cospi_24_64;
   2458   temp2 = step2[19] * (tran_high_t)cospi_24_64 +
   2459           step2[28] * (tran_high_t)cospi_8_64;
   2460   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2461   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2462   temp1 = -step2[20] * (tran_high_t)cospi_24_64 -
   2463           step2[27] * (tran_high_t)cospi_8_64;
   2464   temp2 = -step2[20] * (tran_high_t)cospi_8_64 +
   2465           step2[27] * (tran_high_t)cospi_24_64;
   2466   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2467   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2468   temp1 = -step2[21] * (tran_high_t)cospi_24_64 -
   2469           step2[26] * (tran_high_t)cospi_8_64;
   2470   temp2 = -step2[21] * (tran_high_t)cospi_8_64 +
   2471           step2[26] * (tran_high_t)cospi_24_64;
   2472   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2473   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2474   step1[22] = step2[22];
   2475   step1[23] = step2[23];
   2476   step1[24] = step2[24];
   2477   step1[25] = step2[25];
   2478   step1[30] = step2[30];
   2479   step1[31] = step2[31];
   2480 
   2481   // stage 6: step1[0..7] sums/differences of (k, 7-k); (10,13), (11,12) scaled by cospi_16_64.
   2482   step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
   2483   step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
   2484   step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
   2485   step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
   2486   step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
   2487   step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
   2488   step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
   2489   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   2490   step2[8] = step1[8];
   2491   step2[9] = step1[9];
   2492   temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
   2493   temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
   2494   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2495   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2496   temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
   2497   temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
   2498   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2499   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2500   step2[14] = step1[14];
   2501   step2[15] = step1[15];
   2502   // step1[16..23]: sums/differences of (16,23), (17,22), (18,21), (19,20).
   2503   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
   2504   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
   2505   step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
   2506   step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
   2507   step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
   2508   step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
   2509   step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
   2510   step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
   2511   // step1[24..31]: sums/differences of (24,31), (25,30), (26,29), (27,28).
   2512   step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
   2513   step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
   2514   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
   2515   step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
   2516   step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
   2517   step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
   2518   step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
   2519   step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
   2520 
   2521   // stage 7: step2[0..15] sums/differences of (k, 15-k).
   2522   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
   2523   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
   2524   step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
   2525   step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
   2526   step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
   2527   step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
   2528   step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
   2529   step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
   2530   step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
   2531   step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
   2532   step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
   2533   step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
   2534   step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
   2535   step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
   2536   step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
   2537   step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
   2538   // step2[16..19] and [28..31] are copied; (20,27), (21,26), (22,25), (23,24) scaled by cospi_16_64.
   2539   step1[16] = step2[16];
   2540   step1[17] = step2[17];
   2541   step1[18] = step2[18];
   2542   step1[19] = step2[19];
   2543   temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
   2544   temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
   2545   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2546   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2547   temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
   2548   temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
   2549   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2550   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2551   temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
   2552   temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
   2553   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2554   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2555   temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
   2556   temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
   2557   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   2558   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   2559   step1[28] = step2[28];
   2560   step1[29] = step2[29];
   2561   step1[30] = step2[30];
   2562   step1[31] = step2[31];
   2563 
   2564   // final stage: output[k] = step1[k] + step1[31-k], output[31-k] = step1[k] - step1[31-k], k = 0..15.
   2565   output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
   2566   output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
   2567   output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
   2568   output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
   2569   output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
   2570   output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
   2571   output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
   2572   output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
   2573   output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
   2574   output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
   2575   output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
   2576   output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
   2577   output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
   2578   output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
   2579   output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
   2580   output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
   2581   output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
   2582   output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
   2583   output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
   2584   output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
   2585   output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
   2586   output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
   2587   output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
   2588   output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
   2589   output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
   2590   output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
   2591   output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
   2592   output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
   2593   output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
   2594   output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
   2595   output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
   2596   output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
   2597 }
   2598 
   2599 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
   2600                                      int stride, int bd) {
   2601   /* Full 32x32 high-bitdepth inverse transform: a row pass fills a scratch
   2602      buffer, then a column pass adds the rounded result into dest. */
   2603   tran_low_t rows[32 * 32], col_in[32], col_out[32];
   2604   int r, c;
   2605 
   2606   // Row pass: a row whose coefficients OR to zero is zero-filled directly.
   2607   for (r = 0; r < 32; ++r) {
   2608     const tran_low_t *const src = input + 32 * r;
   2609     tran_low_t *const dst = rows + 32 * r;
   2610     tran_low_t any_bits = 0;
   2611     for (c = 0; c < 32; ++c) any_bits |= src[c];
   2612     if (!any_bits) {
   2613       memset(dst, 0, sizeof(tran_low_t) * 32);
   2614     } else {
   2615       highbd_idct32_c(src, dst, bd);
   2616     }
   2617   }
   2618 
   2619   // Column pass: gather column c, transform it, add it into dest.
   2620   for (c = 0; c < 32; ++c) {
   2621     for (r = 0; r < 32; ++r) col_in[r] = rows[r * 32 + c];
   2622     highbd_idct32_c(col_in, col_out, bd);
   2623     for (r = 0; r < 32; ++r) {
   2624       uint16_t *const px = &dest[r * stride + c];
   2625       *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6), bd);
   2626     }
   2627   }
   2628 }
   2629 
   2630 void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
   2631                                     int stride, int bd) {
   2632   /* Reduced 32x32 high-bitdepth inverse transform: only the first 16 input
   2633      rows go through the row pass; the remaining scratch rows stay zero. */
   2634   tran_low_t rows[32 * 32] = { 0 };
   2635   tran_low_t col_in[32], col_out[32];
   2636   int r, c;
   2637 
   2638   // Row pass: non-zero coefficients are confined to the upper-left 16x16,
   2639   // so input rows 16..31 are never read.
   2640   for (r = 0; r < 16; ++r) {
   2641     const tran_low_t *const src = input + 32 * r;
   2642     tran_low_t *const dst = rows + 32 * r;
   2643     highbd_idct32_c(src, dst, bd);
   2644   }
   2645 
   2646   // Column pass: transform each of the 32 columns of the scratch buffer
   2647   // and add the rounded result into the matching column of dest.
   2648   for (c = 0; c < 32; ++c) {
   2649     for (r = 0; r < 32; ++r) col_in[r] = rows[r * 32 + c];
   2650     highbd_idct32_c(col_in, col_out, bd);
   2651     for (r = 0; r < 32; ++r) {
   2652       uint16_t *const px = &dest[r * stride + c];
   2653       *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6), bd);
   2654     }
   2655   }
   2656 }
   2657 
   2658 void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
   2659                                    int stride, int bd) {
   2660   /* Reduced 32x32 high-bitdepth inverse transform: only the first 8 input
   2661      rows go through the row pass; the remaining scratch rows stay zero. */
   2662   tran_low_t rows[32 * 32] = { 0 };
   2663   tran_low_t col_in[32], col_out[32];
   2664   int r, c;
   2665 
   2666   // Row pass: non-zero coefficients are confined to the upper-left 8x8,
   2667   // so input rows 8..31 are never read.
   2668   for (r = 0; r < 8; ++r) {
   2669     highbd_idct32_c(input + 32 * r, rows + 32 * r, bd);
   2670   }
   2671 
   2672   // Column pass: transform each of the 32 columns of the scratch buffer
   2673   // and add the rounded result into the matching column of dest.
   2674   for (c = 0; c < 32; ++c) {
   2675     for (r = 0; r < 32; ++r) col_in[r] = rows[r * 32 + c];
   2676     highbd_idct32_c(col_in, col_out, bd);
   2677     for (r = 0; r < 32; ++r) {
   2678       uint16_t *const px = &dest[r * stride + c];
   2679       *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6), bd);
   2680     }
   2681   }
   2682 }
   2683 
   2684 void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
   2685                                   int stride, int bd) {
   2686   /* DC-only 32x32 case: input[0] is scaled by cospi_16_64 twice, rounded
   2687      with ROUND_POWER_OF_TWO(.., 6), and added to every pixel of the block. */
   2688   const tran_high_t pass1 = input[0] * (tran_high_t)cospi_16_64;
   2689   const tran_low_t dc1 = HIGHBD_WRAPLOW(dct_const_round_shift(pass1), bd);
   2690   const tran_high_t pass2 = dc1 * (tran_high_t)cospi_16_64;
   2691   const tran_low_t dc2 = HIGHBD_WRAPLOW(dct_const_round_shift(pass2), bd);
   2692   const int offset = ROUND_POWER_OF_TWO(dc2, 6);
   2693   int row, col;
   2694   for (row = 0; row < 32; ++row, dest += stride) {
   2695     for (col = 0; col < 32; ++col) {
   2696       dest[col] = highbd_clip_pixel_add(dest[col], offset, bd);
   2697     }
   2698   }
   2699 }
   2700 
   2701 #endif  // CONFIG_VP9_HIGHBITDEPTH
   2702