/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/inv_txfm_dspr2.h"
#include "vpx_dsp/txfm_common.h"

#if HAVE_DSPR2
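/* Row pass: apply the 8-point IDCT to each of no_rows rows of 8 input
 * coefficients.  Results are written transposed (consecutive outputs of a
 * row are 8 int16_t / 16 bytes apart), so the column pass can read each
 * column as a contiguous group of 8 coefficients. */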
void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  const int const_2_power_13 = 8192;
  int Temp0, Temp1, Temp2, Temp3, Temp4;
  int i;

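  /* Descriptive note (inferred from the code): every rounded product below
   * uses the same DSPr2 pattern.  mtlo/mthi seed an accumulator with the
   * rounding constant 1 << 13, madd/msub accumulate the cospi_*_64
   * products, and extp (with the POS field set to 45 by the callers via
   * wrdsp) extracts the accumulator shifted right by DCT_CONST_BITS,
   * i.e. it implements dct_const_round_shift(). */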
  for (i = no_rows; i--;) {
    __asm__ __volatile__ (
        /*
          temp_1 = (input[0] + input[4]) * cospi_16_64;
          step2_0 = dct_const_round_shift(temp_1);

          temp_2 = (input[0] - input[4]) * cospi_16_64;
          step2_1 = dct_const_round_shift(temp_2);
        */
        "lh       %[Temp0],             0(%[input])                     \n\t"
        "lh       %[Temp1],             8(%[input])                     \n\t"
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
        "extp     %[Temp4],             $ac0,           31              \n\t"

        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "extp     %[Temp2],             $ac1,           31              \n\t"

        /*
          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
          step2_2 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0],             4(%[input])                     \n\t"
        "lh       %[Temp1],             12(%[input])                    \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "extp     %[Temp3],             $ac0,           31              \n\t"

        /*
          step1_1 = step2_1 + step2_2;
          step1_2 = step2_1 - step2_2;
        */
        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"

        /*
          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
          step2_3 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
        "extp     %[Temp1],             $ac1,           31              \n\t"

        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"

        /*
          step1_0 = step2_0 + step2_3;
          step1_3 = step2_0 - step2_3;
        */
        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"

        /*
          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
          step1_4 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0],             2(%[input])                     \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "lh       %[Temp1],             14(%[input])                    \n\t"
        "lh       %[Temp0],             2(%[input])                     \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
        "extp     %[step1_4],           $ac0,           31              \n\t"

        /*
          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
          step1_7 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
        "extp     %[step1_7],           $ac1,           31              \n\t"

        /*
          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
          step1_5 = dct_const_round_shift(temp_1);
        */
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "lh       %[Temp0],             10(%[input])                    \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
        "lh       %[Temp1],             6(%[input])                     \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
        "extp     %[step1_5],           $ac0,           31              \n\t"

        /*
          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
          step1_6 = dct_const_round_shift(temp_2);
        */
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "lh       %[Temp0],             10(%[input])                    \n\t"
        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
        "lh       %[Temp1],             6(%[input])                     \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
        "extp     %[step1_6],           $ac1,           31              \n\t"

        /*
          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
        */
        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"

        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"

        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"

        /*
          step1_4 = step1_4 + step1_5;
          step1_7 = step1_6 + step1_7;
        */
        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"

        "extp     %[step1_5],           $ac0,           31              \n\t"
        "extp     %[step1_6],           $ac1,           31              \n\t"

        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
        "sh       %[Temp0],             0(%[output])                    \n\t"
        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
        "sh       %[Temp1],             16(%[output])                   \n\t"
        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
        "sh       %[Temp0],             32(%[output])                   \n\t"
        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
        "sh       %[Temp1],             48(%[output])                   \n\t"

        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
        "sh       %[Temp0],             64(%[output])                   \n\t"
        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
        "sh       %[Temp1],             80(%[output])                   \n\t"
        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
        "sh       %[Temp0],             96(%[output])                   \n\t"
        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
        "sh       %[Temp1],             112(%[output])                  \n\t"

        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [Temp4] "=&r" (Temp4)
        : [const_2_power_13] "r" (const_2_power_13),
          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
          [cospi_24_64] "r" (cospi_24_64),
          [output] "r" (output), [input] "r" (input)
    );

    input += 8;
    output += 1;
  }
}

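/* Column pass: apply the 8-point IDCT to each of the 8 columns of the
 * transposed intermediate data, then round each result with
 * (x + 16) >> 5 and add it to the destination pixel, clamping to
 * [0, 255] through the vpx_ff_cropTbl lookup. */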
void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                 int dest_stride) {
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  int Temp0, Temp1, Temp2, Temp3;
  int i;
  const int const_2_power_13 = 8192;
  uint8_t *dest_pix;
  uint8_t *cm = vpx_ff_cropTbl;

  /* prefetch vpx_ff_cropTbl */
  prefetch_load(vpx_ff_cropTbl);
  prefetch_load(vpx_ff_cropTbl +  32);
  prefetch_load(vpx_ff_cropTbl +  64);
  prefetch_load(vpx_ff_cropTbl +  96);
  prefetch_load(vpx_ff_cropTbl + 128);
  prefetch_load(vpx_ff_cropTbl + 160);
  prefetch_load(vpx_ff_cropTbl + 192);
  prefetch_load(vpx_ff_cropTbl + 224);

  for (i = 0; i < 8; ++i) {
    dest_pix = (dest + i);

    __asm__ __volatile__ (
        /*
          temp_1 = (input[0] + input[4]) * cospi_16_64;
          step2_0 = dct_const_round_shift(temp_1);

          temp_2 = (input[0] - input[4]) * cospi_16_64;
          step2_1 = dct_const_round_shift(temp_2);
        */
        "lh       %[Temp0],             0(%[input])                     \n\t"
        "lh       %[Temp1],             8(%[input])                     \n\t"
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
        "extp     %[step1_6],           $ac0,           31              \n\t"

        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "extp     %[Temp2],             $ac1,           31              \n\t"

        /*
          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
          step2_2 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0],             4(%[input])                     \n\t"
        "lh       %[Temp1],             12(%[input])                    \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "extp     %[Temp3],             $ac0,           31              \n\t"

        /*
          step1_1 = step2_1 + step2_2;
          step1_2 = step2_1 - step2_2;
        */
        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"

        /*
          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
          step2_3 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
        "extp     %[Temp1],             $ac1,           31              \n\t"

        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"

        /*
          step1_0 = step2_0 + step2_3;
          step1_3 = step2_0 - step2_3;
        */
        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"

        /*
          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
          step1_4 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0],             2(%[input])                     \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "lh       %[Temp1],             14(%[input])                    \n\t"
        "lh       %[Temp0],             2(%[input])                     \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
        "extp     %[step1_4],           $ac0,           31              \n\t"

        /*
          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
          step1_7 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
        "extp     %[step1_7],           $ac1,           31              \n\t"

        /*
          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
          step1_5 = dct_const_round_shift(temp_1);
        */
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "lh       %[Temp0],             10(%[input])                    \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
        "lh       %[Temp1],             6(%[input])                     \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
        "extp     %[step1_5],           $ac0,           31              \n\t"

        /*
          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
          step1_6 = dct_const_round_shift(temp_2);
        */
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "lh       %[Temp0],             10(%[input])                    \n\t"
        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
        "lh       %[Temp1],             6(%[input])                     \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
        "extp     %[step1_6],           $ac1,           31              \n\t"

        /*
          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
        */
        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"

        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"

        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"

        /*
          step1_4 = step1_4 + step1_5;
          step1_7 = step1_6 + step1_7;
        */
        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"

        "extp     %[step1_5],           $ac0,           31              \n\t"
        "extp     %[step1_6],           $ac1,           31              \n\t"

        /* add block */
        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [dest_pix] "+r" (dest_pix)
        : [const_2_power_13] "r" (const_2_power_13),
          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
          [cospi_24_64] "r" (cospi_24_64),
          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
    );

    input += 8;
  }
}

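/* Full 8x8 inverse transform: all 64 coefficients may be nonzero, so all
 * eight rows are transformed before the column pass. */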
void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows
  idct8_rows_dspr2(input, outptr, 8);

  // Then transform columns and add to dest
  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
}

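/* 8x8 inverse transform for blocks with at most 12 nonzero coefficients,
 * which lie in the top-left 4x4 corner (the first four rows).  Only four
 * rows are transformed; the entries of the transposed intermediate buffer
 * that the row pass never wrote are then cleared before the column pass. */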
void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows
  idct8_rows_dspr2(input, outptr, 4);

  outptr += 4;

  __asm__ __volatile__ (
      "sw  $zero,   0(%[outptr])  \n\t"
      "sw  $zero,   4(%[outptr])  \n\t"
      "sw  $zero,  16(%[outptr])  \n\t"
      "sw  $zero,  20(%[outptr])  \n\t"
      "sw  $zero,  32(%[outptr])  \n\t"
      "sw  $zero,  36(%[outptr])  \n\t"
      "sw  $zero,  48(%[outptr])  \n\t"
      "sw  $zero,  52(%[outptr])  \n\t"
      "sw  $zero,  64(%[outptr])  \n\t"
      "sw  $zero,  68(%[outptr])  \n\t"
      "sw  $zero,  80(%[outptr])  \n\t"
      "sw  $zero,  84(%[outptr])  \n\t"
      "sw  $zero,  96(%[outptr])  \n\t"
      "sw  $zero, 100(%[outptr])  \n\t"
      "sw  $zero, 112(%[outptr])  \n\t"
      "sw  $zero, 116(%[outptr])  \n\t"

      :
      : [outptr] "r" (outptr)
  );

  // Then transform columns and add to dest
  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
}

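/* DC-only 8x8 inverse transform: the single DC coefficient is scaled and
 * rounded to one pixel offset, which is then added to (or, when the offset
 * is negative, subtracted from) every pixel of the 8x8 block, four bytes
 * at a time with saturating addu_s.qb / subu_s.qb operations. */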
void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t t1, t2, vector_a1, vector_1, vector_2;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"

    :
    : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  __asm__ __volatile__ (
      "addi     %[out],     %[out],     16      \n\t"
      "sra      %[a1],      %[out],     5       \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

  if (a1 < 0) {
    /* use quad-byte operations;
     * input and output memory are four-byte aligned */
    __asm__ __volatile__ (
        "abs        %[absa1],       %[a1]       \n\t"
        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw           %[t1],          0(%[dest])                      \n\t"
          "lw           %[t2],          4(%[dest])                      \n\t"
          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "sw           %[vector_1],    0(%[dest])                      \n\t"
          "sw           %[vector_2],    4(%[dest])                      \n\t"
          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  } else {
    /* use quad-byte operations;
     * input and output memory are four-byte aligned */
    __asm__ __volatile__ (
        "replv.qb   %[vector_a1],   %[a1]   \n\t"

        : [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw           %[t1],          0(%[dest])                      \n\t"
          "lw           %[t2],          4(%[dest])                      \n\t"
          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "sw           %[vector_1],    0(%[dest])                      \n\t"
          "sw           %[vector_2],    4(%[dest])                      \n\t"
          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  }
}

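/* 8-point ADST (used by the hybrid ADST/DCT transforms), written in plain
 * C.  If every input coefficient is zero, the output is cleared and the
 * butterfly stages are skipped. */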
void iadst8_dspr2(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  int x0, x1, x2, x3, x4, x5, x6, x7;

  x0 = input[7];
  x1 = input[0];
  x2 = input[5];
  x3 = input[2];
  x4 = input[3];
  x5 = input[4];
  x6 = input[1];
  x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);

  output[0] =  x0;
  output[1] = -x4;
  output[2] =  x6;
  output[3] = -x2;
  output[4] =  x3;
  output[5] = -x7;
  output[6] =  x5;
  output[7] = -x1;
}
#endif  // HAVE_DSPR2