Home | History | Annotate | Download | only in mips
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "./vpx_config.h"
     12 #include "./vpx_dsp_rtcd.h"
     13 #include "vpx_dsp/mips/inv_txfm_dspr2.h"
     14 #include "vpx_dsp/txfm_common.h"
     15 
     16 #if HAVE_DSPR2
     /*
      * idct16_rows_dspr2 - row pass of what the name indicates is a 16-point
      * inverse DCT, written as MIPS DSP inline assembly (this file only builds
      * it when HAVE_DSPR2 is set).
      *
      * input:   rows of 16 int16_t coefficients; one row is consumed per loop
      *          iteration and the pointer then advances by 16 elements.
      * output:  results are written transposed: result k (0..15) of the r-th
      *          processed row is stored at output[k * 16 + r], i.e. at byte
      *          offset k * 32 from a pointer that advances one element per row.
      * no_rows: number of rows to process.
      *
      * Every rounded product below follows the same pattern: an accumulator is
      * preloaded with const_2_power_13 (8192) in lo and zero in hi, madd/msub
      * accumulate value * cospi_N_64 products into it, and "extp ..., 31" reads
      * the result back.  In the comments, rnd(x) denotes that extp result for
      * an accumulator holding 8192 + x, and inK denotes input[K].
      *
      * NOTE(review): extp takes its extraction position from DSPControl state
      * that this function never sets - presumably the caller programs it before
      * calling; confirm against the call sites.
      * NOTE(review): none of the asm statements has a clobber list although
      * they overwrite accumulators $ac0-$ac3, load through `input` and store
      * through `output`; confirm whether "memory" / accumulator clobbers are
      * required for the compilers this is built with.
      */
     17 void idct16_rows_dspr2(const int16_t *input, int16_t *output,
     18                        uint32_t no_rows) {
     19   int i;
     20   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
     21   int step1_10, step1_11, step1_12, step1_13;
     22   int step2_0, step2_1, step2_2, step2_3;
     23   int step2_8, step2_9, step2_10, step2_11;
     24   int step2_12, step2_13, step2_14, step2_15;
     25   int load1, load2, load3, load4, load5, load6, load7, load8;
     26   int result1, result2, result3, result4;
          /* Rounding bias preloaded into an accumulator before each madd/msub. */
     27   const int const_2_power_13 = 8192;
     28 
          /* Counts i down from no_rows; the body runs once per row. */
     29   for (i = no_rows; i--;) {
     30     /* prefetch row */
     31     prefetch_load((const uint8_t *)(input + 16));
     32 
            /*
             * Uses in0, in8, in4, in12 (byte offsets 0, 16, 8, 24):
             *   step2_0 = rnd((in0 + in8) * cospi_16_64)
             *   step2_1 = rnd((in0 - in8) * cospi_16_64)
             *   step2_2 = rnd(in4 * cospi_24_64 - in12 * cospi_8_64)
             *   step2_3 = rnd(in4 * cospi_8_64  + in12 * cospi_24_64)
             *   step1_0 = step2_0 + step2_3,  step1_1 = step2_1 + step2_2
             *   step1_2 = step2_1 - step2_2,  step1_3 = step2_0 - step2_3
             */
     33     __asm__ __volatile__(
     34         "lh       %[load1],              0(%[input])                    \n\t"
     35         "lh       %[load2],             16(%[input])                    \n\t"
     36         "lh       %[load3],              8(%[input])                    \n\t"
     37         "lh       %[load4],             24(%[input])                    \n\t"
     38 
     39         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
     40         "mthi     $zero,                $ac1                            \n\t"
     41         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
     42         "mthi     $zero,                $ac2                            \n\t"
     43         "add      %[result1],           %[load1],       %[load2]        \n\t"
     44         "sub      %[result2],           %[load1],       %[load2]        \n\t"
     45         "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
     46         "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
     47         "extp     %[step2_0],           $ac1,           31              \n\t"
     48         "extp     %[step2_1],           $ac2,           31              \n\t"
     49 
     50         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
     51         "mthi     $zero,                $ac3                            \n\t"
     52         "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
     53         "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
     54         "extp     %[step2_2],           $ac3,           31              \n\t"
     55 
     56         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
     57         "mthi     $zero,                $ac1                            \n\t"
     58         "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
     59         "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
     60         "extp     %[step2_3],           $ac1,           31              \n\t"
     61 
     62         "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
     63         "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
     64         "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
     65         "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
     66 
     67         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
     68           [load4] "=&r"(load4), [result1] "=&r"(result1),
     69           [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
     70           [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
     71           [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
     72           [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
     73           [step1_3] "=r"(step1_3)
     74         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
     75           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
     76           [cospi_16_64] "r"(cospi_16_64));
     77 
            /*
             * Uses in1, in15, in9, in7 (byte offsets 2, 30, 18, 14):
             *   result1 = rnd(in1 * cospi_30_64 - in15 * cospi_2_64)
             *   result2 = rnd(in9 * cospi_14_64 - in7  * cospi_18_64)
             *   result3 = rnd(in9 * cospi_18_64 + in7  * cospi_14_64)
             *   result4 = rnd(in1 * cospi_2_64  + in15 * cospi_30_64)
             * load5/load6 are then reused as scratch:
             *   load5 = result1 - result2,  load6 = result4 - result3
             *   step2_9  = rnd(load6 * cospi_24_64 - load5 * cospi_8_64)
             *   step2_14 = rnd(load5 * cospi_24_64 + load6 * cospi_8_64)
             *   step2_8  = result1 + result2,  step2_15 = result4 + result3
             */
     78     __asm__ __volatile__(
     79         "lh       %[load5],             2(%[input])                     \n\t"
     80         "lh       %[load6],             30(%[input])                    \n\t"
     81         "lh       %[load7],             18(%[input])                    \n\t"
     82         "lh       %[load8],             14(%[input])                    \n\t"
     83 
     84         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
     85         "mthi     $zero,                $ac1                            \n\t"
     86         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
     87         "mthi     $zero,                $ac3                            \n\t"
     88 
     89         "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
     90         "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
     91         "extp     %[result1],           $ac1,           31              \n\t"
     92 
     93         "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
     94         "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
     95         "extp     %[result2],           $ac3,           31              \n\t"
     96 
     97         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
     98         "mthi     $zero,                $ac1                            \n\t"
     99         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    100         "mthi     $zero,                $ac2                            \n\t"
    101 
    102         "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
    103         "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
    104         "extp     %[result3],           $ac1,           31              \n\t"
    105 
    106         "madd     $ac2,                 %[load5],       %[cospi_2_64]   \n\t"
    107         "madd     $ac2,                 %[load6],       %[cospi_30_64]  \n\t"
    108         "extp     %[result4],           $ac2,           31              \n\t"
    109 
    110         "sub      %[load5],             %[result1],     %[result2]      \n\t"
    111         "sub      %[load6],             %[result4],     %[result3]      \n\t"
    112 
    113         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    114         "mthi     $zero,                $ac1                            \n\t"
    115         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    116         "mthi     $zero,                $ac3                            \n\t"
    117 
    118         "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
    119         "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
    120         "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
    121         "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
    122 
    123         "extp     %[step2_9],           $ac1,           31              \n\t"
    124         "extp     %[step2_14],          $ac3,           31              \n\t"
    125         "add      %[step2_8],           %[result1],     %[result2]      \n\t"
    126         "add      %[step2_15],          %[result4],     %[result3]      \n\t"
    127 
    128         : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
    129           [load8] "=&r"(load8), [result1] "=&r"(result1),
    130           [result2] "=&r"(result2), [result3] "=&r"(result3),
    131           [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
    132           [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
    133           [step2_14] "=r"(step2_14)
    134         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    135           [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
    136           [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
    137           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
    138 
            /*
             * Uses in5, in11, in13, in3 (byte offsets 10, 22, 26, 6):
             *   result1 = rnd(in5  * cospi_22_64 - in11 * cospi_10_64)
             *   result2 = rnd(in13 * cospi_6_64  - in3  * cospi_26_64)
             *   result3 = rnd(in5  * cospi_10_64 + in11 * cospi_22_64)
             *   result4 = rnd(in13 * cospi_26_64 + in3  * cospi_6_64)
             * load1/load2 are then reused as scratch:
             *   load1 = result2 - result1,  load2 = result4 - result3
             *   step2_10 = rnd(-load1 * cospi_24_64 - load2 * cospi_8_64)
             *   step2_13 = rnd( load2 * cospi_24_64 - load1 * cospi_8_64)
             *   step2_11 = result1 + result2,  step2_12 = result4 + result3
             */
    139     __asm__ __volatile__(
    140         "lh       %[load1],             10(%[input])                    \n\t"
    141         "lh       %[load2],             22(%[input])                    \n\t"
    142         "lh       %[load3],             26(%[input])                    \n\t"
    143         "lh       %[load4],             6(%[input])                     \n\t"
    144 
    145         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    146         "mthi     $zero,                $ac1                            \n\t"
    147         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    148         "mthi     $zero,                $ac3                            \n\t"
    149 
    150         "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
    151         "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
    152         "extp     %[result1],           $ac1,           31              \n\t"
    153 
    154         "madd     $ac3,                 %[load3],       %[cospi_6_64]   \n\t"
    155         "msub     $ac3,                 %[load4],       %[cospi_26_64]  \n\t"
    156         "extp     %[result2],           $ac3,           31              \n\t"
    157 
    158         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    159         "mthi     $zero,                $ac1                            \n\t"
    160         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    161         "mthi     $zero,                $ac2                            \n\t"
    162 
    163         "madd     $ac1,                 %[load1],       %[cospi_10_64]  \n\t"
    164         "madd     $ac1,                 %[load2],       %[cospi_22_64]  \n\t"
    165         "extp     %[result3],           $ac1,           31              \n\t"
    166 
    167         "madd     $ac2,                 %[load3],       %[cospi_26_64]  \n\t"
    168         "madd     $ac2,                 %[load4],       %[cospi_6_64]   \n\t"
    169         "extp     %[result4],           $ac2,           31              \n\t"
    170 
    171         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    172         "mthi     $zero,                $ac1                            \n\t"
    173         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    174         "mthi     $zero,                $ac3                            \n\t"
    175 
    176         "sub      %[load1],             %[result2],     %[result1]      \n\t"
    177         "sub      %[load2],             %[result4],     %[result3]      \n\t"
    178 
    179         "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
    180         "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
    181         "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
    182         "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
    183 
    184         "extp     %[step2_10],          $ac1,           31              \n\t"
    185         "extp     %[step2_13],          $ac3,           31              \n\t"
    186         "add      %[step2_11],          %[result1],     %[result2]      \n\t"
    187         "add      %[step2_12],          %[result4],     %[result3]      \n\t"
    188 
    189         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
    190           [load4] "=&r"(load4), [result1] "=&r"(result1),
    191           [result2] "=&r"(result2), [result3] "=&r"(result3),
    192           [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
    193           [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
    194           [step2_13] "=r"(step2_13)
    195         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    196           [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
    197           [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
    198           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
    199 
            /*
             * Uses in2, in14, in10, in6 (byte offsets 4, 28, 20, 12):
             *   result1 = rnd(in2  * cospi_28_64 - in14 * cospi_4_64)
             *   result2 = rnd(in10 * cospi_12_64 - in6  * cospi_20_64)
             *   result3 = rnd(in10 * cospi_20_64 + in6  * cospi_12_64)
             *   result4 = rnd(in2  * cospi_4_64  + in14 * cospi_28_64)
             * load5/load6 are then reused as scratch:
             *   load5 = result4 - result3 - result1 + result2
             *   load6 = result1 - result2 - result3 + result4
             *   step1_5 = rnd(load5 * cospi_16_64)
             *   step1_6 = rnd(load6 * cospi_16_64)
             *   step1_4 = result1 + result2,  step1_7 = result4 + result3
             */
    200     __asm__ __volatile__(
    201         "lh       %[load5],             4(%[input])                     \n\t"
    202         "lh       %[load6],             28(%[input])                    \n\t"
    203         "lh       %[load7],             20(%[input])                    \n\t"
    204         "lh       %[load8],             12(%[input])                    \n\t"
    205 
    206         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    207         "mthi     $zero,                $ac1                            \n\t"
    208         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    209         "mthi     $zero,                $ac3                            \n\t"
    210 
    211         "madd     $ac1,                 %[load5],       %[cospi_28_64]  \n\t"
    212         "msub     $ac1,                 %[load6],       %[cospi_4_64]   \n\t"
    213         "extp     %[result1],           $ac1,           31              \n\t"
    214 
    215         "madd     $ac3,                 %[load7],       %[cospi_12_64]  \n\t"
    216         "msub     $ac3,                 %[load8],       %[cospi_20_64]  \n\t"
    217         "extp     %[result2],           $ac3,           31              \n\t"
    218 
    219         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    220         "mthi     $zero,                $ac1                            \n\t"
    221         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    222         "mthi     $zero,                $ac2                            \n\t"
    223 
    224         "madd     $ac1,                 %[load7],       %[cospi_20_64]  \n\t"
    225         "madd     $ac1,                 %[load8],       %[cospi_12_64]  \n\t"
    226         "extp     %[result3],           $ac1,           31              \n\t"
    227 
    228         "madd     $ac2,                 %[load5],       %[cospi_4_64]   \n\t"
    229         "madd     $ac2,                 %[load6],       %[cospi_28_64]  \n\t"
    230         "extp     %[result4],           $ac2,           31              \n\t"
    231 
    232         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    233         "mthi     $zero,                $ac1                            \n\t"
    234         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    235         "mthi     $zero,                $ac3                            \n\t"
    236 
    237         "sub      %[load5],             %[result4],     %[result3]      \n\t"
    238         "sub      %[load5],             %[load5],       %[result1]      \n\t"
    239         "add      %[load5],             %[load5],       %[result2]      \n\t"
    240 
    241         "sub      %[load6],             %[result1],     %[result2]      \n\t"
    242         "sub      %[load6],             %[load6],       %[result3]      \n\t"
    243         "add      %[load6],             %[load6],       %[result4]      \n\t"
    244 
    245         "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
    246         "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
    247 
    248         "extp     %[step1_5],           $ac1,           31              \n\t"
    249         "extp     %[step1_6],           $ac3,           31              \n\t"
    250         "add      %[step1_4],           %[result1],     %[result2]      \n\t"
    251         "add      %[step1_7],           %[result4],     %[result3]      \n\t"
    252 
    253         : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
    254           [load8] "=&r"(load8), [result1] "=&r"(result1),
    255           [result2] "=&r"(result2), [result3] "=&r"(result3),
    256           [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
    257           [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
    258           [step1_7] "=r"(step1_7)
    259         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    260           [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
    261           [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
    262           [cospi_16_64] "r"(cospi_16_64));
    263 
            /*
             * Register-only stage (no loads or stores); load5/load6 are scratch:
             *   step1_10 = rnd((step2_14 - step2_13 - step2_9  + step2_10) * cospi_16_64)
             *   step1_13 = rnd((step2_14 - step2_13 - step2_10 + step2_9)  * cospi_16_64)
             *   step1_11 = rnd((step2_15 - step2_12 - step2_8  + step2_11) * cospi_16_64)
             *   step1_12 = rnd((step2_15 - step2_12 - step2_11 + step2_8)  * cospi_16_64)
             * This is the only statement that also uses $ac0.
             */
    264     __asm__ __volatile__(
    265         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    266         "mthi     $zero,                $ac0                            \n\t"
    267         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    268         "mthi     $zero,                $ac1                            \n\t"
    269 
    270         "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
    271         "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
    272         "add      %[load5],             %[load5],       %[step2_10]     \n\t"
    273 
    274         "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
    275 
    276         "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
    277         "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
    278         "add      %[load6],             %[load6],       %[step2_9]      \n\t"
    279 
    280         "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
    281 
    282         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    283         "mthi     $zero,                $ac2                            \n\t"
    284         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    285         "mthi     $zero,                $ac3                            \n\t"
    286 
    287         "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
    288         "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
    289         "add      %[load5],             %[load5],       %[step2_11]     \n\t"
    290 
    291         "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
    292 
    293         "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
    294         "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
    295         "add      %[load6],             %[load6],       %[step2_8]      \n\t"
    296 
    297         "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
    298 
    299         "extp     %[step1_10],          $ac0,           31              \n\t"
    300         "extp     %[step1_13],          $ac1,           31              \n\t"
    301         "extp     %[step1_11],          $ac2,           31              \n\t"
    302         "extp     %[step1_12],          $ac3,           31              \n\t"
    303 
    304         : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
    305           [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
    306           [step1_13] "=r"(step1_13)
    307         : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
    308           [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
    309           [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
    310           [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
    311           [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
    312 
            /*
             * Final sums for results 0, 1, 6, 7, 8, 9, 14, 15, each stored as a
             * halfword at byte offset k * 32 from output:
             *   out0  = step1_0 + step1_7 + step2_12 + step2_15
             *   out1  = step1_1 + step1_6 + step2_13 + step2_14
             *   out6  = step1_1 - step1_6 + step2_9  + step2_10
             *   out7  = step1_0 - step1_7 + step2_8  + step2_11
             *   out8  = step1_0 - step1_7 - step2_8  - step2_11
             *   out9  = step1_1 - step1_6 - step2_9  - step2_10
             *   out14 = step1_1 + step1_6 - step2_13 - step2_14
             *   out15 = step1_0 + step1_7 - step2_12 - step2_15
             */
    313     __asm__ __volatile__(
    314         "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"
    315         "add      %[load5],             %[load5],       %[step2_12]     \n\t"
    316         "add      %[load5],             %[load5],       %[step2_15]     \n\t"
    317         "add      %[load6],             %[step1_1],     %[step1_6]      \n\t"
    318         "add      %[load6],             %[load6],       %[step2_13]     \n\t"
    319         "add      %[load6],             %[load6],       %[step2_14]     \n\t"
    320         "sh       %[load5],             0(%[output])                    \n\t"
    321         "sh       %[load6],             32(%[output])                   \n\t"
    322         "sub      %[load5],             %[step1_1],     %[step1_6]      \n\t"
    323         "add      %[load5],             %[load5],       %[step2_9]      \n\t"
    324         "add      %[load5],             %[load5],       %[step2_10]     \n\t"
    325         "sub      %[load6],             %[step1_0],     %[step1_7]      \n\t"
    326         "add      %[load6],             %[load6],       %[step2_8]      \n\t"
    327         "add      %[load6],             %[load6],       %[step2_11]     \n\t"
    328         "sh       %[load5],             192(%[output])                  \n\t"
    329         "sh       %[load6],             224(%[output])                  \n\t"
    330         "sub      %[load5],             %[step1_0],     %[step1_7]      \n\t"
    331         "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
    332         "sub      %[load5],             %[load5],       %[step2_11]     \n\t"
    333         "sub      %[load6],             %[step1_1],     %[step1_6]      \n\t"
    334         "sub      %[load6],             %[load6],       %[step2_9]      \n\t"
    335         "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
    336         "sh       %[load5],             256(%[output])                  \n\t"
    337         "sh       %[load6],             288(%[output])                  \n\t"
    338         "add      %[load5],             %[step1_1],     %[step1_6]      \n\t"
    339         "sub      %[load5],             %[load5],       %[step2_13]     \n\t"
    340         "sub      %[load5],             %[load5],       %[step2_14]     \n\t"
    341         "add      %[load6],             %[step1_0],     %[step1_7]      \n\t"
    342         "sub      %[load6],             %[load6],       %[step2_12]     \n\t"
    343         "sub      %[load6],             %[load6],       %[step2_15]     \n\t"
    344         "sh       %[load5],             448(%[output])                  \n\t"
    345         "sh       %[load6],             480(%[output])                  \n\t"
    346 
    347         : [load5] "=&r"(load5), [load6] "=&r"(load6)
    348         : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1),
    349           [step1_6] "r"(step1_6), [step1_7] "r"(step1_7),
    350           [step2_8] "r"(step2_8), [step2_9] "r"(step2_9),
    351           [step2_10] "r"(step2_10), [step2_11] "r"(step2_11),
    352           [step2_12] "r"(step2_12), [step2_13] "r"(step2_13),
    353           [step2_14] "r"(step2_14), [step2_15] "r"(step2_15));
    354 
            /*
             * Final sums for results 2, 3, 4, 5, 10, 11, 12, 13, stored the same
             * way (halfword at byte offset k * 32):
             *   out2  = step1_2 + step1_5 + step1_13
             *   out3  = step1_3 + step1_4 + step1_12
             *   out4  = step1_3 - step1_4 + step1_11
             *   out5  = step1_2 - step1_5 + step1_10
             *   out10 = step1_2 - step1_5 - step1_10
             *   out11 = step1_3 - step1_4 - step1_11
             *   out12 = step1_3 + step1_4 - step1_12
             *   out13 = step1_2 + step1_5 - step1_13
             */
    355     __asm__ __volatile__(
    356         "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"
    357         "add      %[load5],             %[load5],       %[step1_13]     \n\t"
    358         "add      %[load6],             %[step1_3],     %[step1_4]      \n\t"
    359         "add      %[load6],             %[load6],       %[step1_12]     \n\t"
    360         "sh       %[load5],             64(%[output])                   \n\t"
    361         "sh       %[load6],             96(%[output])                   \n\t"
    362         "sub      %[load5],             %[step1_3],     %[step1_4]      \n\t"
    363         "add      %[load5],             %[load5],       %[step1_11]     \n\t"
    364         "sub      %[load6],             %[step1_2],     %[step1_5]      \n\t"
    365         "add      %[load6],             %[load6],       %[step1_10]     \n\t"
    366         "sh       %[load5],             128(%[output])                  \n\t"
    367         "sh       %[load6],             160(%[output])                  \n\t"
    368         "sub      %[load5],             %[step1_2],     %[step1_5]      \n\t"
    369         "sub      %[load5],             %[load5],       %[step1_10]     \n\t"
    370         "sub      %[load6],             %[step1_3],     %[step1_4]      \n\t"
    371         "sub      %[load6],             %[load6],       %[step1_11]     \n\t"
    372         "sh       %[load5],             320(%[output])                  \n\t"
    373         "sh       %[load6],             352(%[output])                  \n\t"
    374         "add      %[load5],             %[step1_3],     %[step1_4]      \n\t"
    375         "sub      %[load5],             %[load5],       %[step1_12]     \n\t"
    376         "add      %[load6],             %[step1_2],     %[step1_5]      \n\t"
    377         "sub      %[load6],             %[load6],       %[step1_13]     \n\t"
    378         "sh       %[load5],             384(%[output])                  \n\t"
    379         "sh       %[load6],             416(%[output])                  \n\t"
    380 
    381         : [load5] "=&r"(load5), [load6] "=&r"(load6)
    382         : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
    383           [step1_4] "r"(step1_4), [step1_5] "r"(step1_5),
    384           [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
    385           [step1_12] "r"(step1_12), [step1_13] "r"(step1_13));
    386 
            /* Next input row (16 coefficients); next output slot is one element on,
             * which is what makes the stored result transposed. */
    387     input += 16;
    388     output += 1;
    389   }
    390 }
    391 
    392 void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
    393   int i;
    394   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
    395   int step1_8, step1_9, step1_10, step1_11;
    396   int step1_12, step1_13, step1_14, step1_15;
    397   int step2_0, step2_1, step2_2, step2_3;
    398   int step2_8, step2_9, step2_10, step2_11;
    399   int step2_12, step2_13, step2_14, step2_15;
    400   int load1, load2, load3, load4, load5, load6, load7, load8;
    401   int result1, result2, result3, result4;
    402   const int const_2_power_13 = 8192;
    403   uint8_t *dest_pix;
    404   uint8_t *cm = vpx_ff_cropTbl;
    405 
    406   /* prefetch vpx_ff_cropTbl */
    407   prefetch_load(vpx_ff_cropTbl);
    408   prefetch_load(vpx_ff_cropTbl + 32);
    409   prefetch_load(vpx_ff_cropTbl + 64);
    410   prefetch_load(vpx_ff_cropTbl + 96);
    411   prefetch_load(vpx_ff_cropTbl + 128);
    412   prefetch_load(vpx_ff_cropTbl + 160);
    413   prefetch_load(vpx_ff_cropTbl + 192);
    414   prefetch_load(vpx_ff_cropTbl + 224);
    415 
    416   for (i = 0; i < 16; ++i) {
    417     dest_pix = (dest + i);
    418     __asm__ __volatile__(
    419         "lh       %[load1],              0(%[input])                    \n\t"
    420         "lh       %[load2],             16(%[input])                    \n\t"
    421         "lh       %[load3],              8(%[input])                    \n\t"
    422         "lh       %[load4],             24(%[input])                    \n\t"
    423 
    424         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    425         "mthi     $zero,                $ac1                            \n\t"
    426         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    427         "mthi     $zero,                $ac2                            \n\t"
    428         "add      %[result1],           %[load1],       %[load2]        \n\t"
    429         "sub      %[result2],           %[load1],       %[load2]        \n\t"
    430         "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
    431         "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
    432         "extp     %[step2_0],           $ac1,           31              \n\t"
    433         "extp     %[step2_1],           $ac2,           31              \n\t"
    434 
    435         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    436         "mthi     $zero,                $ac3                            \n\t"
    437         "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
    438         "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
    439         "extp     %[step2_2],           $ac3,           31              \n\t"
    440 
    441         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    442         "mthi     $zero,                $ac1                            \n\t"
    443         "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
    444         "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
    445         "extp     %[step2_3],           $ac1,           31              \n\t"
    446 
    447         "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
    448         "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
    449         "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
    450         "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
    451 
    452         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
    453           [load4] "=&r"(load4), [result1] "=&r"(result1),
    454           [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
    455           [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
    456           [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
    457           [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
    458           [step1_3] "=r"(step1_3)
    459         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    460           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
    461           [cospi_16_64] "r"(cospi_16_64));
    462 
    463     __asm__ __volatile__(
    464         "lh       %[load5],             2(%[input])                     \n\t"
    465         "lh       %[load6],             30(%[input])                    \n\t"
    466         "lh       %[load7],             18(%[input])                    \n\t"
    467         "lh       %[load8],             14(%[input])                    \n\t"
    468 
    469         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    470         "mthi     $zero,                $ac1                            \n\t"
    471         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    472         "mthi     $zero,                $ac3                            \n\t"
    473 
    474         "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
    475         "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
    476         "extp     %[result1],           $ac1,           31              \n\t"
    477 
    478         "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
    479         "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
    480         "extp     %[result2],           $ac3,           31              \n\t"
    481 
    482         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    483         "mthi     $zero,                $ac1                            \n\t"
    484         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    485         "mthi     $zero,                $ac2                            \n\t"
    486 
    487         "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
    488         "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
    489         "extp     %[result3],           $ac1,           31              \n\t"
    490 
    491         "madd     $ac2,                 %[load5],        %[cospi_2_64]  \n\t"
    492         "madd     $ac2,                 %[load6],        %[cospi_30_64] \n\t"
    493         "extp     %[result4],           $ac2,            31             \n\t"
    494 
    495         "sub      %[load5],             %[result1],     %[result2]      \n\t"
    496         "sub      %[load6],             %[result4],     %[result3]      \n\t"
    497 
    498         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    499         "mthi     $zero,                $ac1                            \n\t"
    500         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    501         "mthi     $zero,                $ac3                            \n\t"
    502 
    503         "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
    504         "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
    505         "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
    506         "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
    507 
    508         "extp     %[step2_9],           $ac1,           31              \n\t"
    509         "extp     %[step2_14],          $ac3,           31              \n\t"
    510         "add      %[step2_8],           %[result1],     %[result2]      \n\t"
    511         "add      %[step2_15],          %[result4],     %[result3]      \n\t"
    512 
    513         : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
    514           [load8] "=&r"(load8), [result1] "=&r"(result1),
    515           [result2] "=&r"(result2), [result3] "=&r"(result3),
    516           [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
    517           [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
    518           [step2_14] "=r"(step2_14)
    519         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    520           [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
    521           [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
    522           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
    523 
    524     __asm__ __volatile__(
    525         "lh       %[load1],             10(%[input])                    \n\t"
    526         "lh       %[load2],             22(%[input])                    \n\t"
    527         "lh       %[load3],             26(%[input])                    \n\t"
    528         "lh       %[load4],             6(%[input])                     \n\t"
    529 
    530         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    531         "mthi     $zero,                $ac1                            \n\t"
    532         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    533         "mthi     $zero,                $ac3                            \n\t"
    534 
    535         "madd     $ac1,                 %[load1],    %[cospi_22_64]     \n\t"
    536         "msub     $ac1,                 %[load2],    %[cospi_10_64]     \n\t"
    537         "extp     %[result1],           $ac1,        31                 \n\t"
    538 
    539         "madd     $ac3,                 %[load3],    %[cospi_6_64]      \n\t"
    540         "msub     $ac3,                 %[load4],    %[cospi_26_64]     \n\t"
    541         "extp     %[result2],           $ac3,        31                 \n\t"
    542 
    543         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    544         "mthi     $zero,                $ac1                            \n\t"
    545         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    546         "mthi     $zero,                $ac2                            \n\t"
    547 
    548         "madd     $ac1,                 %[load1],    %[cospi_10_64]     \n\t"
    549         "madd     $ac1,                 %[load2],    %[cospi_22_64]     \n\t"
    550         "extp     %[result3],           $ac1,        31                 \n\t"
    551 
    552         "madd     $ac2,                 %[load3],    %[cospi_26_64]     \n\t"
    553         "madd     $ac2,                 %[load4],    %[cospi_6_64]      \n\t"
    554         "extp     %[result4],           $ac2,        31                 \n\t"
    555 
    556         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    557         "mthi     $zero,                $ac1                            \n\t"
    558         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    559         "mthi     $zero,                $ac3                            \n\t"
    560 
    561         "sub      %[load1],             %[result2],     %[result1]      \n\t"
    562         "sub      %[load2],             %[result4],     %[result3]      \n\t"
    563 
    564         "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
    565         "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
    566         "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
    567         "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
    568 
    569         "extp     %[step2_10],          $ac1,           31              \n\t"
    570         "extp     %[step2_13],          $ac3,           31              \n\t"
    571         "add      %[step2_11],          %[result1],     %[result2]      \n\t"
    572         "add      %[step2_12],          %[result4],     %[result3]      \n\t"
    573 
    574         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
    575           [load4] "=&r"(load4), [result1] "=&r"(result1),
    576           [result2] "=&r"(result2), [result3] "=&r"(result3),
    577           [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
    578           [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
    579           [step2_13] "=r"(step2_13)
    580         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    581           [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
    582           [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
    583           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
    584 
    585     __asm__ __volatile__(
    586         "lh       %[load5],             4(%[input])                   \n\t"
    587         "lh       %[load6],             28(%[input])                  \n\t"
    588         "lh       %[load7],             20(%[input])                  \n\t"
    589         "lh       %[load8],             12(%[input])                  \n\t"
    590 
    591         "mtlo     %[const_2_power_13],  $ac1                          \n\t"
    592         "mthi     $zero,                $ac1                          \n\t"
    593         "mtlo     %[const_2_power_13],  $ac3                          \n\t"
    594         "mthi     $zero,                $ac3                          \n\t"
    595 
    596         "madd     $ac1,                 %[load5],    %[cospi_28_64]   \n\t"
    597         "msub     $ac1,                 %[load6],    %[cospi_4_64]    \n\t"
    598         "extp     %[result1],           $ac1,        31               \n\t"
    599 
    600         "madd     $ac3,                 %[load7],    %[cospi_12_64]   \n\t"
    601         "msub     $ac3,                 %[load8],    %[cospi_20_64]   \n\t"
    602         "extp     %[result2],           $ac3,        31               \n\t"
    603 
    604         "mtlo     %[const_2_power_13],  $ac1                          \n\t"
    605         "mthi     $zero,                $ac1                          \n\t"
    606         "mtlo     %[const_2_power_13],  $ac2                          \n\t"
    607         "mthi     $zero,                $ac2                          \n\t"
    608 
    609         "madd     $ac1,                 %[load7],    %[cospi_20_64]   \n\t"
    610         "madd     $ac1,                 %[load8],    %[cospi_12_64]   \n\t"
    611         "extp     %[result3],           $ac1,        31               \n\t"
    612 
    613         "madd     $ac2,                 %[load5],    %[cospi_4_64]    \n\t"
    614         "madd     $ac2,                 %[load6],    %[cospi_28_64]   \n\t"
    615         "extp     %[result4],           $ac2,        31               \n\t"
    616 
    617         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    618         "mthi     $zero,                $ac1                            \n\t"
    619         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    620         "mthi     $zero,                $ac3                            \n\t"
    621 
    622         "sub      %[load5],             %[result4],     %[result3]      \n\t"
    623         "sub      %[load5],             %[load5],       %[result1]      \n\t"
    624         "add      %[load5],             %[load5],       %[result2]      \n\t"
    625 
    626         "sub      %[load6],             %[result1],     %[result2]      \n\t"
    627         "sub      %[load6],             %[load6],       %[result3]      \n\t"
    628         "add      %[load6],             %[load6],       %[result4]      \n\t"
    629 
    630         "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
    631         "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
    632 
    633         "extp     %[step1_5],           $ac1,           31              \n\t"
    634         "extp     %[step1_6],           $ac3,           31              \n\t"
    635 
    636         "add      %[step1_4],           %[result1],     %[result2]      \n\t"
    637         "add      %[step1_7],           %[result4],     %[result3]      \n\t"
    638 
    639         : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
    640           [load8] "=&r"(load8), [result1] "=&r"(result1),
    641           [result2] "=&r"(result2), [result3] "=&r"(result3),
    642           [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
    643           [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
    644           [step1_7] "=r"(step1_7)
    645         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    646           [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
    647           [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
    648           [cospi_16_64] "r"(cospi_16_64));
    649 
    650     __asm__ __volatile__(
    651         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    652         "mthi     $zero,                $ac0                            \n\t"
    653         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    654         "mthi     $zero,                $ac1                            \n\t"
    655 
    656         "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
    657         "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
    658         "add      %[load5],             %[load5],       %[step2_10]     \n\t"
    659 
    660         "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
    661 
    662         "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
    663         "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
    664         "add      %[load6],             %[load6],       %[step2_9]      \n\t"
    665 
    666         "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
    667 
    668         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    669         "mthi     $zero,                $ac2                            \n\t"
    670         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    671         "mthi     $zero,                $ac3                            \n\t"
    672 
    673         "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
    674         "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
    675         "add      %[load5],             %[load5],       %[step2_11]     \n\t"
    676 
    677         "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
    678 
    679         "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
    680         "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
    681         "add      %[load6],             %[load6],       %[step2_8]      \n\t"
    682 
    683         "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
    684 
    685         "extp     %[step1_10],          $ac0,           31              \n\t"
    686         "extp     %[step1_13],          $ac1,           31              \n\t"
    687         "extp     %[step1_11],          $ac2,           31              \n\t"
    688         "extp     %[step1_12],          $ac3,           31              \n\t"
    689 
    690         : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
    691           [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
    692           [step1_13] "=r"(step1_13)
    693         : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
    694           [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
    695           [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
    696           [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
    697           [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
    698 
    699     step1_8 = step2_8 + step2_11;
    700     step1_9 = step2_9 + step2_10;
    701     step1_14 = step2_13 + step2_14;
    702     step1_15 = step2_12 + step2_15;
    703 
    704     __asm__ __volatile__(
    705         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
    706         "add      %[load5],         %[step1_0],         %[step1_7]      \n\t"
    707         "add      %[load5],         %[load5],           %[step1_15]     \n\t"
    708         "addi     %[load5],         %[load5],           32              \n\t"
    709         "sra      %[load5],         %[load5],           6               \n\t"
    710         "add      %[load7],         %[load7],           %[load5]        \n\t"
    711         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
    712         "add      %[load6],         %[step1_1],         %[step1_6]      \n\t"
    713         "add      %[load6],         %[load6],           %[step1_14]     \n\t"
    714         "sb       %[load5],         0(%[dest_pix])                      \n\t"
    715         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    716         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
    717         "addi     %[load6],         %[load6],           32              \n\t"
    718         "sra      %[load6],         %[load6],           6               \n\t"
    719         "add      %[load8],         %[load8],           %[load6]        \n\t"
    720         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
    721         "sb       %[load6],         0(%[dest_pix])                      \n\t"
    722         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    723 
    724         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
    725         "add      %[load5],         %[step1_2],         %[step1_5]      \n\t"
    726         "add      %[load5],         %[load5],           %[step1_13]     \n\t"
    727         "addi     %[load5],         %[load5],           32              \n\t"
    728         "sra      %[load5],         %[load5],           6               \n\t"
    729         "add      %[load7],         %[load7],           %[load5]        \n\t"
    730         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
    731         "add      %[load6],         %[step1_3],         %[step1_4]      \n\t"
    732         "add      %[load6],         %[load6],           %[step1_12]     \n\t"
    733         "sb       %[load5],         0(%[dest_pix])                      \n\t"
    734         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    735         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
    736         "addi     %[load6],         %[load6],           32              \n\t"
    737         "sra      %[load6],         %[load6],           6               \n\t"
    738         "add      %[load8],         %[load8],           %[load6]        \n\t"
    739         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
    740         "sb       %[load6],         0(%[dest_pix])                      \n\t"
    741         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    742 
    743         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
    744         "sub      %[load5],         %[step1_3],         %[step1_4]      \n\t"
    745         "add      %[load5],         %[load5],           %[step1_11]     \n\t"
    746         "addi     %[load5],         %[load5],           32              \n\t"
    747         "sra      %[load5],         %[load5],           6               \n\t"
    748         "add      %[load7],         %[load7],           %[load5]        \n\t"
    749         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
    750         "sub      %[load6],         %[step1_2],         %[step1_5]      \n\t"
    751         "add      %[load6],         %[load6],           %[step1_10]     \n\t"
    752         "sb       %[load5],         0(%[dest_pix])                      \n\t"
    753         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    754         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
    755         "addi     %[load6],         %[load6],           32              \n\t"
    756         "sra      %[load6],         %[load6],           6               \n\t"
    757         "add      %[load8],         %[load8],           %[load6]        \n\t"
    758         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
    759         "sb       %[load6],         0(%[dest_pix])                      \n\t"
    760         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    761 
    762         "sub      %[load5],         %[step1_1],         %[step1_6]      \n\t"
    763         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
    764         "add      %[load5],         %[load5],           %[step1_9]      \n\t"
    765         "addi     %[load5],         %[load5],           32              \n\t"
    766         "sra      %[load5],         %[load5],           6               \n\t"
    767         "add      %[load7],         %[load7],           %[load5]        \n\t"
    768         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
    769         "sub      %[load6],         %[step1_0],         %[step1_7]      \n\t"
    770         "add      %[load6],         %[load6],           %[step1_8]      \n\t"
    771         "sb       %[load5],         0(%[dest_pix])                      \n\t"
    772         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    773         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
    774         "addi     %[load6],         %[load6],           32              \n\t"
    775         "sra      %[load6],         %[load6],           6               \n\t"
    776         "add      %[load8],         %[load8],           %[load6]        \n\t"
    777         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
    778         "sb       %[load6],         0(%[dest_pix])                      \n\t"
    779         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    780 
    781         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
    782         "sub      %[load5],         %[step1_0],         %[step1_7]      \n\t"
    783         "sub      %[load5],         %[load5],           %[step1_8]      \n\t"
    784         "addi     %[load5],         %[load5],           32              \n\t"
    785         "sra      %[load5],         %[load5],           6               \n\t"
    786         "add      %[load7],         %[load7],           %[load5]        \n\t"
    787         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
    788         "sub      %[load6],         %[step1_1],         %[step1_6]      \n\t"
    789         "sub      %[load6],         %[load6],           %[step1_9]      \n\t"
    790         "sb       %[load5],         0(%[dest_pix])                      \n\t"
    791         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    792         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
    793         "addi     %[load6],         %[load6],           32              \n\t"
    794         "sra      %[load6],         %[load6],           6               \n\t"
    795         "add      %[load8],         %[load8],           %[load6]        \n\t"
    796         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
    797         "sb       %[load6],         0(%[dest_pix])                      \n\t"
    798         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    799 
    800         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
    801         "sub      %[load5],         %[step1_2],         %[step1_5]      \n\t"
    802         "sub      %[load5],         %[load5],           %[step1_10]     \n\t"
    803         "addi     %[load5],         %[load5],           32              \n\t"
    804         "sra      %[load5],         %[load5],           6               \n\t"
    805         "add      %[load7],         %[load7],           %[load5]        \n\t"
    806         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
    807         "sub      %[load6],         %[step1_3],         %[step1_4]      \n\t"
    808         "sub      %[load6],         %[load6],           %[step1_11]     \n\t"
    809         "sb       %[load5],         0(%[dest_pix])                      \n\t"
    810         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    811         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
    812         "addi     %[load6],         %[load6],           32              \n\t"
    813         "sra      %[load6],         %[load6],           6               \n\t"
    814         "add      %[load8],         %[load8],           %[load6]        \n\t"
    815         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
    816         "sb       %[load6],         0(%[dest_pix])                      \n\t"
    817         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    818 
    819         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
    820         "add      %[load5],         %[step1_3],         %[step1_4]      \n\t"
    821         "sub      %[load5],         %[load5],           %[step1_12]     \n\t"
    822         "addi     %[load5],         %[load5],           32              \n\t"
    823         "sra      %[load5],         %[load5],           6               \n\t"
    824         "add      %[load7],         %[load7],           %[load5]        \n\t"
    825         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
    826         "add      %[load6],         %[step1_2],         %[step1_5]      \n\t"
    827         "sub      %[load6],         %[load6],           %[step1_13]     \n\t"
    828         "sb       %[load5],         0(%[dest_pix])                      \n\t"
    829         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    830         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
    831         "addi     %[load6],         %[load6],           32              \n\t"
    832         "sra      %[load6],         %[load6],           6               \n\t"
    833         "add      %[load8],         %[load8],           %[load6]        \n\t"
    834         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
    835         "sb       %[load6],         0(%[dest_pix])                      \n\t"
    836         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    837 
    838         "lbu      %[load7],         0(%[dest_pix])                      \n\t"
    839         "add      %[load5],         %[step1_1],         %[step1_6]      \n\t"
    840         "sub      %[load5],         %[load5],           %[step1_14]     \n\t"
    841         "addi     %[load5],         %[load5],           32              \n\t"
    842         "sra      %[load5],         %[load5],           6               \n\t"
    843         "add      %[load7],         %[load7],           %[load5]        \n\t"
    844         "lbux     %[load5],         %[load7](%[cm])                     \n\t"
    845         "add      %[load6],         %[step1_0],         %[step1_7]      \n\t"
    846         "sub      %[load6],         %[load6],           %[step1_15]     \n\t"
    847         "sb       %[load5],         0(%[dest_pix])                      \n\t"
    848         "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
    849         "lbu      %[load8],         0(%[dest_pix])                      \n\t"
    850         "addi     %[load6],         %[load6],           32              \n\t"
    851         "sra      %[load6],         %[load6],           6               \n\t"
    852         "add      %[load8],         %[load8],           %[load6]        \n\t"
    853         "lbux     %[load6],         %[load8](%[cm])                     \n\t"
    854         "sb       %[load6],         0(%[dest_pix])                      \n\t"
    855 
    856         : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
    857           [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
    858         :
    859         [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
    860         [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
    861         [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
    862         [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
    863         [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
    864         [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
    865         [step1_14] "r"(step1_14), [step1_15] "r"(step1_15));
    866 
    867     input += 16;
    868   }
    869 }
    870 
    871 void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
    872                                  int stride) {
    873   DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
    874   uint32_t pos = 45;
    875 
    876   /* bit positon for extract from acc */
    877   __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
    878 
    879   // First transform rows
    880   idct16_rows_dspr2(input, out, 16);
    881 
    882   // Then transform columns and add to dest
    883   idct16_cols_add_blk_dspr2(out, dest, stride);
    884 }
    885 
    886 void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
    887                                 int stride) {
    888   DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
    889   int16_t *outptr = out;
    890   uint32_t i;
    891   uint32_t pos = 45;
    892 
    893   /* bit positon for extract from acc */
    894   __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
    895 
    896   // First transform rows. Since all non-zero dct coefficients are in
    897   // upper-left 4x4 area, we only need to calculate first 4 rows here.
    898   idct16_rows_dspr2(input, outptr, 4);
    899 
    900   outptr += 4;
    901   for (i = 0; i < 6; ++i) {
    902     __asm__ __volatile__(
    903         "sw     $zero,    0(%[outptr])     \n\t"
    904         "sw     $zero,   32(%[outptr])     \n\t"
    905         "sw     $zero,   64(%[outptr])     \n\t"
    906         "sw     $zero,   96(%[outptr])     \n\t"
    907         "sw     $zero,  128(%[outptr])     \n\t"
    908         "sw     $zero,  160(%[outptr])     \n\t"
    909         "sw     $zero,  192(%[outptr])     \n\t"
    910         "sw     $zero,  224(%[outptr])     \n\t"
    911         "sw     $zero,  256(%[outptr])     \n\t"
    912         "sw     $zero,  288(%[outptr])     \n\t"
    913         "sw     $zero,  320(%[outptr])     \n\t"
    914         "sw     $zero,  352(%[outptr])     \n\t"
    915         "sw     $zero,  384(%[outptr])     \n\t"
    916         "sw     $zero,  416(%[outptr])     \n\t"
    917         "sw     $zero,  448(%[outptr])     \n\t"
    918         "sw     $zero,  480(%[outptr])     \n\t"
    919 
    920         :
    921         : [outptr] "r"(outptr));
    922 
    923     outptr += 2;
    924   }
    925 
    926   // Then transform columns
    927   idct16_cols_add_blk_dspr2(out, dest, stride);
    928 }
    929 
    930 void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
    931                                int stride) {
    932   uint32_t pos = 45;
    933   int32_t out;
    934   int32_t r;
    935   int32_t a1, absa1;
    936   int32_t vector_a1;
    937   int32_t t1, t2, t3, t4;
    938   int32_t vector_1, vector_2, vector_3, vector_4;
    939 
    940   /* bit position for extract from acc */
    941   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
    942 
    943                        :
    944                        : [pos] "r"(pos));
    945 
    946   out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
    947   __asm__ __volatile__(
    948       "addi     %[out],     %[out],     32      \n\t"
    949       "sra      %[a1],      %[out],     6       \n\t"
    950 
    951       : [out] "+r"(out), [a1] "=r"(a1)
    952       :);
    953 
    954   if (a1 < 0) {
    955     /* use quad-byte
    956      * input and output memory are four byte aligned */
    957     __asm__ __volatile__(
    958         "abs        %[absa1],       %[a1]       \n\t"
    959         "replv.qb   %[vector_a1],   %[absa1]    \n\t"
    960 
    961         : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
    962         : [a1] "r"(a1));
    963 
    964     for (r = 16; r--;) {
    965       __asm__ __volatile__(
    966           "lw             %[t1],          0(%[dest])                      \n\t"
    967           "lw             %[t2],          4(%[dest])                      \n\t"
    968           "lw             %[t3],          8(%[dest])                      \n\t"
    969           "lw             %[t4],          12(%[dest])                     \n\t"
    970           "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
    971           "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
    972           "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
    973           "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
    974           "sw             %[vector_1],    0(%[dest])                      \n\t"
    975           "sw             %[vector_2],    4(%[dest])                      \n\t"
    976           "sw             %[vector_3],    8(%[dest])                      \n\t"
    977           "sw             %[vector_4],    12(%[dest])                     \n\t"
    978           "add            %[dest],        %[dest],        %[stride]       \n\t"
    979 
    980           : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
    981             [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
    982             [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
    983             [dest] "+&r"(dest)
    984           : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
    985     }
    986   } else if (a1 > 255) {
    987     int32_t a11, a12, vector_a11, vector_a12;
    988 
    989     /* use quad-byte
    990      * input and output memory are four byte aligned */
    991     a11 = a1 >> 1;
    992     a12 = a1 - a11;
    993     __asm__ __volatile__(
    994         "replv.qb       %[vector_a11],  %[a11]     \n\t"
    995         "replv.qb       %[vector_a12],  %[a12]     \n\t"
    996 
    997         : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
    998         : [a11] "r"(a11), [a12] "r"(a12));
    999 
   1000     for (r = 16; r--;) {
   1001       __asm__ __volatile__(
   1002           "lw             %[t1],          0(%[dest])                      \n\t"
   1003           "lw             %[t2],          4(%[dest])                      \n\t"
   1004           "lw             %[t3],          8(%[dest])                      \n\t"
   1005           "lw             %[t4],          12(%[dest])                     \n\t"
   1006           "addu_s.qb      %[vector_1],    %[t1],          %[vector_a11]   \n\t"
   1007           "addu_s.qb      %[vector_2],    %[t2],          %[vector_a11]   \n\t"
   1008           "addu_s.qb      %[vector_3],    %[t3],          %[vector_a11]   \n\t"
   1009           "addu_s.qb      %[vector_4],    %[t4],          %[vector_a11]   \n\t"
   1010           "addu_s.qb      %[vector_1],    %[vector_1],    %[vector_a12]   \n\t"
   1011           "addu_s.qb      %[vector_2],    %[vector_2],    %[vector_a12]   \n\t"
   1012           "addu_s.qb      %[vector_3],    %[vector_3],    %[vector_a12]   \n\t"
   1013           "addu_s.qb      %[vector_4],    %[vector_4],    %[vector_a12]   \n\t"
   1014           "sw             %[vector_1],    0(%[dest])                      \n\t"
   1015           "sw             %[vector_2],    4(%[dest])                      \n\t"
   1016           "sw             %[vector_3],    8(%[dest])                      \n\t"
   1017           "sw             %[vector_4],    12(%[dest])                     \n\t"
   1018           "add            %[dest],        %[dest],        %[stride]       \n\t"
   1019 
   1020           : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
   1021             [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
   1022             [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
   1023             [dest] "+&r"(dest)
   1024           : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
   1025             [vector_a12] "r"(vector_a12));
   1026     }
   1027   } else {
   1028     /* use quad-byte
   1029      * input and output memory are four byte aligned */
   1030     __asm__ __volatile__("replv.qb   %[vector_a1],   %[a1]   \n\t"
   1031 
   1032                          : [vector_a1] "=r"(vector_a1)
   1033                          : [a1] "r"(a1));
   1034 
   1035     for (r = 16; r--;) {
   1036       __asm__ __volatile__(
   1037           "lw             %[t1],          0(%[dest])                      \n\t"
   1038           "lw             %[t2],          4(%[dest])                      \n\t"
   1039           "lw             %[t3],          8(%[dest])                      \n\t"
   1040           "lw             %[t4],          12(%[dest])                     \n\t"
   1041           "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
   1042           "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
   1043           "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
   1044           "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
   1045           "sw             %[vector_1],    0(%[dest])                      \n\t"
   1046           "sw             %[vector_2],    4(%[dest])                      \n\t"
   1047           "sw             %[vector_3],    8(%[dest])                      \n\t"
   1048           "sw             %[vector_4],    12(%[dest])                     \n\t"
   1049           "add            %[dest],        %[dest],        %[stride]       \n\t"
   1050 
   1051           : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
   1052             [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
   1053             [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
   1054             [dest] "+&r"(dest)
   1055           : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
   1056     }
   1057   }
   1058 }
   1059 
   1060 void iadst16_dspr2(const int16_t *input, int16_t *output) {
   1061   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
   1062 
   1063   int x0 = input[15];
   1064   int x1 = input[0];
   1065   int x2 = input[13];
   1066   int x3 = input[2];
   1067   int x4 = input[11];
   1068   int x5 = input[4];
   1069   int x6 = input[9];
   1070   int x7 = input[6];
   1071   int x8 = input[7];
   1072   int x9 = input[8];
   1073   int x10 = input[5];
   1074   int x11 = input[10];
   1075   int x12 = input[3];
   1076   int x13 = input[12];
   1077   int x14 = input[1];
   1078   int x15 = input[14];
   1079 
   1080   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
   1081         x13 | x14 | x15)) {
   1082     output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
   1083         output[6] = output[7] = output[8] = output[9] = output[10] =
   1084             output[11] = output[12] = output[13] = output[14] = output[15] = 0;
   1085     return;
   1086   }
   1087 
   1088   // stage 1
   1089   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
   1090   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
   1091   s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
   1092   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
   1093   s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
   1094   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
   1095   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
   1096   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
   1097   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
   1098   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
   1099   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
   1100   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
   1101   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
   1102   s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
   1103   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   1104   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
   1105 
   1106   x0 = dct_const_round_shift(s0 + s8);
   1107   x1 = dct_const_round_shift(s1 + s9);
   1108   x2 = dct_const_round_shift(s2 + s10);
   1109   x3 = dct_const_round_shift(s3 + s11);
   1110   x4 = dct_const_round_shift(s4 + s12);
   1111   x5 = dct_const_round_shift(s5 + s13);
   1112   x6 = dct_const_round_shift(s6 + s14);
   1113   x7 = dct_const_round_shift(s7 + s15);
   1114   x8 = dct_const_round_shift(s0 - s8);
   1115   x9 = dct_const_round_shift(s1 - s9);
   1116   x10 = dct_const_round_shift(s2 - s10);
   1117   x11 = dct_const_round_shift(s3 - s11);
   1118   x12 = dct_const_round_shift(s4 - s12);
   1119   x13 = dct_const_round_shift(s5 - s13);
   1120   x14 = dct_const_round_shift(s6 - s14);
   1121   x15 = dct_const_round_shift(s7 - s15);
   1122 
   1123   // stage 2
   1124   s0 = x0;
   1125   s1 = x1;
   1126   s2 = x2;
   1127   s3 = x3;
   1128   s4 = x4;
   1129   s5 = x5;
   1130   s6 = x6;
   1131   s7 = x7;
   1132   s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
   1133   s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
   1134   s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
   1135   s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
   1136   s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
   1137   s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
   1138   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
   1139   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
   1140 
   1141   x0 = s0 + s4;
   1142   x1 = s1 + s5;
   1143   x2 = s2 + s6;
   1144   x3 = s3 + s7;
   1145   x4 = s0 - s4;
   1146   x5 = s1 - s5;
   1147   x6 = s2 - s6;
   1148   x7 = s3 - s7;
   1149   x8 = dct_const_round_shift(s8 + s12);
   1150   x9 = dct_const_round_shift(s9 + s13);
   1151   x10 = dct_const_round_shift(s10 + s14);
   1152   x11 = dct_const_round_shift(s11 + s15);
   1153   x12 = dct_const_round_shift(s8 - s12);
   1154   x13 = dct_const_round_shift(s9 - s13);
   1155   x14 = dct_const_round_shift(s10 - s14);
   1156   x15 = dct_const_round_shift(s11 - s15);
   1157 
   1158   // stage 3
   1159   s0 = x0;
   1160   s1 = x1;
   1161   s2 = x2;
   1162   s3 = x3;
   1163   s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
   1164   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
   1165   s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
   1166   s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
   1167   s8 = x8;
   1168   s9 = x9;
   1169   s10 = x10;
   1170   s11 = x11;
   1171   s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
   1172   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
   1173   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
   1174   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
   1175 
   1176   x0 = s0 + s2;
   1177   x1 = s1 + s3;
   1178   x2 = s0 - s2;
   1179   x3 = s1 - s3;
   1180   x4 = dct_const_round_shift(s4 + s6);
   1181   x5 = dct_const_round_shift(s5 + s7);
   1182   x6 = dct_const_round_shift(s4 - s6);
   1183   x7 = dct_const_round_shift(s5 - s7);
   1184   x8 = s8 + s10;
   1185   x9 = s9 + s11;
   1186   x10 = s8 - s10;
   1187   x11 = s9 - s11;
   1188   x12 = dct_const_round_shift(s12 + s14);
   1189   x13 = dct_const_round_shift(s13 + s15);
   1190   x14 = dct_const_round_shift(s12 - s14);
   1191   x15 = dct_const_round_shift(s13 - s15);
   1192 
   1193   // stage 4
   1194   s2 = (-cospi_16_64) * (x2 + x3);
   1195   s3 = cospi_16_64 * (x2 - x3);
   1196   s6 = cospi_16_64 * (x6 + x7);
   1197   s7 = cospi_16_64 * (-x6 + x7);
   1198   s10 = cospi_16_64 * (x10 + x11);
   1199   s11 = cospi_16_64 * (-x10 + x11);
   1200   s14 = (-cospi_16_64) * (x14 + x15);
   1201   s15 = cospi_16_64 * (x14 - x15);
   1202 
   1203   x2 = dct_const_round_shift(s2);
   1204   x3 = dct_const_round_shift(s3);
   1205   x6 = dct_const_round_shift(s6);
   1206   x7 = dct_const_round_shift(s7);
   1207   x10 = dct_const_round_shift(s10);
   1208   x11 = dct_const_round_shift(s11);
   1209   x14 = dct_const_round_shift(s14);
   1210   x15 = dct_const_round_shift(s15);
   1211 
   1212   output[0] = x0;
   1213   output[1] = -x8;
   1214   output[2] = x12;
   1215   output[3] = -x4;
   1216   output[4] = x6;
   1217   output[5] = x14;
   1218   output[6] = x10;
   1219   output[7] = x2;
   1220   output[8] = x3;
   1221   output[9] = x11;
   1222   output[10] = x15;
   1223   output[11] = x7;
   1224   output[12] = x5;
   1225   output[13] = -x13;
   1226   output[14] = x9;
   1227   output[15] = -x1;
   1228 }
   1229 
   1230 #endif  // HAVE_DSPR2
   1231