/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"

#include "av1/common/enums.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/av1_inv_txfm1d.h"
#include "av1/common/av1_inv_txfm1d_cfg.h"

void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_low_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

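  // First pass: inverse transform each row of the 4x4 block in place.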
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;

    op[0] = a1;
    op[1] = b1;
    op[2] = c1;
    op[3] = d1;
    ip += 4;
    op += 4;
  }

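  // Second pass: inverse transform each column, then clip-add the result
  // into the destination buffer.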
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;

    range_check_value(a1, bd + 1);
    range_check_value(b1, bd + 1);
    range_check_value(c1, bd + 1);
    range_check_value(d1, bd + 1);

    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);

    ip++;
    dest++;
  }
}

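// DC-only fast path: only in[0] is read, i.e. this covers the case where the
// DC coefficient is the sole nonzero input (eob == 1).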
void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                                int dest_stride, int bd) {
  int i;
  tran_low_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = a1;
  op[1] = op[2] = op[3] = e1;

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] =
        highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
    dest[dest_stride * 1] =
        highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
    dest[dest_stride * 2] =
        highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
    dest[dest_stride * 3] =
        highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
    ip++;
    dest++;
  }
}

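// Maps a 1D inverse transform type to its C implementation.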
static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) {
  switch (txfm_type) {
    case TXFM_TYPE_DCT4: return av1_idct4_new;
    case TXFM_TYPE_DCT8: return av1_idct8_new;
    case TXFM_TYPE_DCT16: return av1_idct16_new;
    case TXFM_TYPE_DCT32: return av1_idct32_new;
    case TXFM_TYPE_DCT64: return av1_idct64_new;
    case TXFM_TYPE_ADST4: return av1_iadst4_new;
    case TXFM_TYPE_ADST8: return av1_iadst8_new;
    case TXFM_TYPE_ADST16: return av1_iadst16_new;
    case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c;
    case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c;
    case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c;
    case TXFM_TYPE_IDENTITY32: return av1_iidentity32_c;
    default: assert(0); return NULL;
  }
}

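// Rounding shifts applied after each 1D pass: shift[0] follows the row
// transform and shift[1] the column transform. Negative values denote right
// shifts; inv_txfm2d_add_c() negates them before calling
// av1_round_shift_array().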
static const int8_t inv_shift_4x4[2] = { 0, -4 };
static const int8_t inv_shift_8x8[2] = { -1, -4 };
static const int8_t inv_shift_16x16[2] = { -2, -4 };
static const int8_t inv_shift_32x32[2] = { -2, -4 };
static const int8_t inv_shift_64x64[2] = { -2, -4 };
static const int8_t inv_shift_4x8[2] = { 0, -4 };
static const int8_t inv_shift_8x4[2] = { 0, -4 };
static const int8_t inv_shift_8x16[2] = { -1, -4 };
static const int8_t inv_shift_16x8[2] = { -1, -4 };
static const int8_t inv_shift_16x32[2] = { -1, -4 };
static const int8_t inv_shift_32x16[2] = { -1, -4 };
static const int8_t inv_shift_32x64[2] = { -1, -4 };
static const int8_t inv_shift_64x32[2] = { -1, -4 };
static const int8_t inv_shift_4x16[2] = { -1, -4 };
static const int8_t inv_shift_16x4[2] = { -1, -4 };
static const int8_t inv_shift_8x32[2] = { -2, -4 };
static const int8_t inv_shift_32x8[2] = { -2, -4 };
static const int8_t inv_shift_16x64[2] = { -2, -4 };
static const int8_t inv_shift_64x16[2] = { -2, -4 };

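// Indexed by TX_SIZE; the entry order must match the TX_SIZE enum.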
const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = {
  inv_shift_4x4,   inv_shift_8x8,   inv_shift_16x16, inv_shift_32x32,
  inv_shift_64x64, inv_shift_4x8,   inv_shift_8x4,   inv_shift_8x16,
  inv_shift_16x8,  inv_shift_16x32, inv_shift_32x16, inv_shift_32x64,
  inv_shift_64x32, inv_shift_4x16,  inv_shift_16x4,  inv_shift_8x32,
  inv_shift_32x8,  inv_shift_16x64, inv_shift_64x16,
};

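// Zero entries below mark width/height index pairs that have no
// corresponding TX size (aspect ratio beyond 4:1); every valid size uses
// INV_COS_BIT.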
/* clang-format off */
const int8_t inv_cos_bit_col[MAX_TXWH_IDX]      // txw_idx
                            [MAX_TXWH_IDX] = {  // txh_idx
    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0,           0 },
    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0 },
    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
    {           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
    {           0,           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }
  };

const int8_t inv_cos_bit_row[MAX_TXWH_IDX]      // txw_idx
                            [MAX_TXWH_IDX] = {  // txh_idx
    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0,           0 },
    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0 },
    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
    {           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
    {           0,           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }
  };
/* clang-format on */

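// Extra range, in bits, needed by each stage of the 4-point inverse ADST;
// only stage 1 requires one extra bit (see av1_gen_inv_stage_range()).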
const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 };

void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
                          TXFM_2D_FLIP_CFG *cfg) {
  assert(cfg != NULL);
  cfg->tx_size = tx_size;
  av1_zero(cfg->stage_range_col);
  av1_zero(cfg->stage_range_row);
  set_flip_cfg(tx_type, cfg);
  const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
  const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
  cfg->shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  cfg->cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  cfg->cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
  if (cfg->txfm_type_col == TXFM_TYPE_ADST4) {
    memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range));
  }
  cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row];
  if (cfg->txfm_type_row == TXFM_TYPE_ADST4) {
    memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range));
  }
  cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
  cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
}

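// Computes the per-stage range bounds (in bits) handed to the 1D transform
// functions. opt_range_row/col is the intermediate precision targeted for
// each bit depth; the asserts below check that it covers the real range
// except for the documented adst4 stage-1 case.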
void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
                             const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size,
                             int bd) {
  const int fwd_shift = inv_start_range[tx_size];
  const int8_t *shift = cfg->shift;
  int8_t opt_range_row, opt_range_col;
  if (bd == 8) {
    opt_range_row = 16;
    opt_range_col = 16;
  } else if (bd == 10) {
    opt_range_row = 18;
    opt_range_col = 16;
  } else {
    assert(bd == 12);
    opt_range_row = 20;
    opt_range_col = 18;
  }
  // Bounding i by MAX_TXFM_STAGE_NUM quiets a spurious array-bounds warning.
  for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
    int real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1;
    (void)real_range_row;
    if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) {
      // The adst4 may use one extra bit on top of opt_range_row at stage 1,
      // so opt_range_row >= real_range_row need not hold here.
      stage_range_row[i] = opt_range_row;
    } else {
      assert(opt_range_row >= real_range_row);
      stage_range_row[i] = opt_range_row;
    }
  }
  // Bounding i by MAX_TXFM_STAGE_NUM quiets a spurious array-bounds warning.
  for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
    int real_range_col =
        cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1;
    (void)real_range_col;
    if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) {
      // The adst4 may use one extra bit on top of opt_range_col at stage 1,
      // so opt_range_col >= real_range_col need not hold here.
      stage_range_col[i] = opt_range_col;
    } else {
      assert(opt_range_col >= real_range_col);
      stage_range_col[i] = opt_range_col;
    }
  }
}

static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
                                    int stride, TXFM_2D_FLIP_CFG *cfg,
                                    int32_t *txfm_buf, TX_SIZE tx_size,
                                    int bd) {
  // Note when assigning txfm_size_col, we use the txfm_size from the
  // row configuration and vice versa. This is intentional: for rectangular
  // transforms, the number of columns equals the txfm_size stored in the
  // row cfg struct. It makes no difference for square transforms.
  const int txfm_size_col = tx_size_wide[cfg->tx_size];
  const int txfm_size_row = tx_size_high[cfg->tx_size];
  // Take the shift from the larger dimension in the rectangular case.
  const int8_t *shift = cfg->shift;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
  int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
  assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
  assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
  av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd);

  const int8_t cos_bit_col = cfg->cos_bit_col;
  const int8_t cos_bit_row = cfg->cos_bit_row;
  const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col);
  const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row);

  // txfm_buf is used for intermediate data; its length is
  // txfm_size_row * txfm_size_col + 2 * AOMMAX(txfm_size_row, txfm_size_col).
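  // Layout: [ temp_in: max dim | temp_out: max dim | buf: row * col ].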
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_in = txfm_buf;
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *buf = temp_out + buf_offset;
  int32_t *buf_ptr = buf;
  int c, r;

  // Rows
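  // For 2:1 rectangular sizes (|rect_type| == 1), pre-scale each input by
  // 1/sqrt(2) so the overall 2D gain stays a power of two; 4:1 sizes need no
  // such correction because their extra gain already is one.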
  for (r = 0; r < txfm_size_row; ++r) {
    if (abs(rect_type) == 1) {
      for (c = 0; c < txfm_size_col; ++c) {
        temp_in[c] = round_shift((int64_t)input[c] * NewInvSqrt2, NewSqrt2Bits);
      }
    } else {
      for (c = 0; c < txfm_size_col; ++c) {
        temp_in[c] = input[c];
      }
    }
    clamp_buf(temp_in, txfm_size_col, bd + 8);
    txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row);
    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
    input += txfm_size_col;
    buf_ptr += txfm_size_col;
  }

  // Columns
  for (c = 0; c < txfm_size_col; ++c) {
    if (cfg->lr_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + c];
    } else {
      // flip left right
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
    }
    clamp_buf(temp_in, txfm_size_row, AOMMAX(bd + 6, 16));
    txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
    if (cfg->ud_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] =
            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
      }
    } else {
      // flip upside down
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] = highbd_clip_pixel_add(
            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
      }
    }
  }
}

static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output,
                                         int stride, int32_t *txfm_buf,
                                         TX_TYPE tx_type, TX_SIZE tx_size,
                                         int bd) {
  TXFM_2D_FLIP_CFG cfg;
  av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg);
  // Forward shift sum uses larger square size, to be consistent with what
  // av1_gen_inv_stage_range() does for inverse shifts.
  inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, tx_size, bd);
}

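// Each per-size wrapper below sizes txfm_buf as w * h + 2 * AOMMAX(w, h)
// entries, matching the layout inv_txfm2d_add_c() expects. Usage sketch
// (illustrative only; `coeffs` and `dst` are hypothetical caller buffers):
//   int32_t coeffs[4 * 8];       // dequantized coefficients, row-major
//   uint16_t dst[8 * 4] = { 0 }; // 10-bit destination pixels, stride 4
//   av1_inv_txfm2d_add_4x8_c(coeffs, dst, /*stride=*/4, DCT_DCT, /*bd=*/10);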
void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output,
                              int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd);
}

void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output,
                              int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd);
}

void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[8 * 16 + 16 + 16]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd);
}

void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd);
}

void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[16 * 32 + 32 + 32]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd);
}

void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd);
}

void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
                              int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 4 + 4]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd);
}

void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output,
                              int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[8 * 8 + 8 + 8]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X8, bd);
}

void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[16 * 16 + 16 + 16]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd);
}

void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd);
}

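// AV1 codes at most the low-frequency 32x32 region of any 64-point
// transform, so the wrappers below zero-extend their smaller input to the
// full block before transforming.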
void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
  // TODO(urvang): Can the same array be reused, instead of using a new array?
  // Remap the 32x32 input into a modified 64x64 input by:
  // - Copying the input values into the top-left 32x32 locations.
  // - Setting the remaining locations to 0.
  int32_t mod_input[64 * 64];
  for (int row = 0; row < 32; ++row) {
    memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
    memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
  }
  memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input));
  DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
  inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64,
                        bd);
}

void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
  // Remap the 32x32 input into a modified 64x32 input by:
  // - Copying the input values into the top-left 32x32 locations.
  // - Setting the remaining locations to 0.
  int32_t mod_input[64 * 32];
  for (int row = 0; row < 32; ++row) {
    memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
    memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
  }
  DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]);
  inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32,
                        bd);
}

void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
  // Remap the 32x32 input into a modified 32x64 input by:
  // - Copying the input values into the top 32x32 locations.
  // - Setting the remaining locations to 0.
  int32_t mod_input[32 * 64];
  memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input));
  memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input));
  DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]);
  inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64,
                        bd);
}

void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
  // Remap the 16x32 input into a modified 16x64 input by:
  // - Copying the input values into the top 16x32 locations.
  // - Setting the remaining locations to 0.
  int32_t mod_input[16 * 64];
  memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input));
  memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input));
  DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]);
  inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_16X64,
                        bd);
}

void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
  // Remap the 32x16 input into a modified 64x16 input by:
  // - Copying the input values into the left 32x16 locations.
  // - Setting the remaining locations to 0.
  int32_t mod_input[64 * 16];
  for (int row = 0; row < 16; ++row) {
    memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
    memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
  }
  DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]);
  inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16,
                        bd);
}

void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X16, bd);
}

void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd);
}

void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X32, bd);
}

void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
  DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]);
  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd);
}
    505