Home | History | Annotate | Download | only in mips
      1 /*
      2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <stdio.h>
     13 
     14 #include "./vpx_config.h"
     15 #include "vpx_dsp/mips/inv_txfm_dspr2.h"
     16 #include "vpx_dsp/txfm_common.h"
     17 
     18 #if HAVE_DSPR2
     19 static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
     20                               uint32_t no_rows) {
     21   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
     22   int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
     23   int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
     24   int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
     25   int step1_28, step1_29, step1_30, step1_31;
     26   int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
     27   int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
     28   int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
     29   int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
     30   int step2_28, step2_29, step2_30, step2_31;
     31   int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
     32   int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
     33   int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
     34   int step3_29, step3_30, step3_31;
     35   int temp0, temp1, temp2, temp3;
     36   int load1, load2, load3, load4;
     37   int result1, result2;
     38   int i;
     39   const int const_2_power_13 = 8192;
     40   const int32_t *input_int;
     41 
     42   for (i = no_rows; i--;) {
     43     input_int = (const int32_t *)input;
     44 
     45     if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
     46           input_int[4] | input_int[5] | input_int[6] | input_int[7] |
     47           input_int[8] | input_int[9] | input_int[10] | input_int[11] |
     48           input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
     49       input += 32;
     50 
     51       __asm__ __volatile__(
     52           "sh     $zero,     0(%[output])     \n\t"
     53           "sh     $zero,    64(%[output])     \n\t"
     54           "sh     $zero,   128(%[output])     \n\t"
     55           "sh     $zero,   192(%[output])     \n\t"
     56           "sh     $zero,   256(%[output])     \n\t"
     57           "sh     $zero,   320(%[output])     \n\t"
     58           "sh     $zero,   384(%[output])     \n\t"
     59           "sh     $zero,   448(%[output])     \n\t"
     60           "sh     $zero,   512(%[output])     \n\t"
     61           "sh     $zero,   576(%[output])     \n\t"
     62           "sh     $zero,   640(%[output])     \n\t"
     63           "sh     $zero,   704(%[output])     \n\t"
     64           "sh     $zero,   768(%[output])     \n\t"
     65           "sh     $zero,   832(%[output])     \n\t"
     66           "sh     $zero,   896(%[output])     \n\t"
     67           "sh     $zero,   960(%[output])     \n\t"
     68           "sh     $zero,  1024(%[output])     \n\t"
     69           "sh     $zero,  1088(%[output])     \n\t"
     70           "sh     $zero,  1152(%[output])     \n\t"
     71           "sh     $zero,  1216(%[output])     \n\t"
     72           "sh     $zero,  1280(%[output])     \n\t"
     73           "sh     $zero,  1344(%[output])     \n\t"
     74           "sh     $zero,  1408(%[output])     \n\t"
     75           "sh     $zero,  1472(%[output])     \n\t"
     76           "sh     $zero,  1536(%[output])     \n\t"
     77           "sh     $zero,  1600(%[output])     \n\t"
     78           "sh     $zero,  1664(%[output])     \n\t"
     79           "sh     $zero,  1728(%[output])     \n\t"
     80           "sh     $zero,  1792(%[output])     \n\t"
     81           "sh     $zero,  1856(%[output])     \n\t"
     82           "sh     $zero,  1920(%[output])     \n\t"
     83           "sh     $zero,  1984(%[output])     \n\t"
     84 
     85           :
     86           : [output] "r"(output));
     87 
     88       output += 1;
     89 
     90       continue;
     91     }
     92 
     93     /* prefetch the next row (its first and second 16-element halves) */
     94     prefetch_load((const uint8_t *)(input + 32));
     95     prefetch_load((const uint8_t *)(input + 48));
     96 
     97     __asm__ __volatile__(
     98         "lh       %[load1],             2(%[input])                     \n\t"
     99         "lh       %[load2],             62(%[input])                    \n\t"
    100         "lh       %[load3],             34(%[input])                    \n\t"
    101         "lh       %[load4],             30(%[input])                    \n\t"
    102 
    103         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    104         "mthi     $zero,                $ac1                            \n\t"
    105         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    106         "mthi     $zero,                $ac3                            \n\t"
    107 
    108         "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"
    109         "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"
    110         "extp     %[temp0],             $ac1,           31              \n\t"
    111 
    112         "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"
    113         "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"
    114         "extp     %[temp3],             $ac3,           31              \n\t"
    115 
    116         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    117         "mthi     $zero,                $ac1                            \n\t"
    118         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    119         "mthi     $zero,                $ac2                            \n\t"
    120 
    121         "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"
    122         "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"
    123         "extp     %[temp1],             $ac2,           31              \n\t"
    124 
    125         "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"
    126         "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"
    127         "extp     %[temp2],             $ac1,           31              \n\t"
    128 
    129         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    130         "mthi     $zero,                $ac1                            \n\t"
    131         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    132         "mthi     $zero,                $ac3                            \n\t"
    133 
    134         "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
    135         "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
    136 
    137         "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
    138         "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
    139         "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
    140         "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
    141 
    142         "extp     %[step1_17],          $ac1,           31              \n\t"
    143         "extp     %[step1_30],          $ac3,           31              \n\t"
    144         "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"
    145         "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"
    146 
    147         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
    148           [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
    149           [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
    150           [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17),
    151           [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31)
    152         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    153           [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
    154           [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
    155           [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
    156 
    157     __asm__ __volatile__(
    158         "lh       %[load1],             18(%[input])                    \n\t"
    159         "lh       %[load2],             46(%[input])                    \n\t"
    160         "lh       %[load3],             50(%[input])                    \n\t"
    161         "lh       %[load4],             14(%[input])                    \n\t"
    162 
    163         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    164         "mthi     $zero,                $ac1                            \n\t"
    165         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    166         "mthi     $zero,                $ac3                            \n\t"
    167 
    168         "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"
    169         "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"
    170         "extp     %[temp0],             $ac1,           31              \n\t"
    171 
    172         "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"
    173         "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"
    174         "extp     %[temp3],             $ac3,           31              \n\t"
    175 
    176         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    177         "mthi     $zero,                $ac1                            \n\t"
    178         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    179         "mthi     $zero,                $ac2                            \n\t"
    180 
    181         "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"
    182         "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"
    183         "extp     %[temp1],             $ac2,           31              \n\t"
    184 
    185         "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"
    186         "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"
    187         "extp     %[temp2],             $ac1,           31              \n\t"
    188 
    189         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    190         "mthi     $zero,                $ac1                            \n\t"
    191         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    192         "mthi     $zero,                $ac3                            \n\t"
    193 
    194         "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
    195         "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
    196 
    197         "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
    198         "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
    199         "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
    200         "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
    201 
    202         "extp     %[step1_18],          $ac1,           31              \n\t"
    203         "extp     %[step1_29],          $ac3,           31              \n\t"
    204         "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"
    205         "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"
    206 
    207         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
    208           [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
    209           [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
    210           [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19),
    211           [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29)
    212         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    213           [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
    214           [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
    215           [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
    216 
    217     __asm__ __volatile__(
    218         "lh       %[load1],             10(%[input])                    \n\t"
    219         "lh       %[load2],             54(%[input])                    \n\t"
    220         "lh       %[load3],             42(%[input])                    \n\t"
    221         "lh       %[load4],             22(%[input])                    \n\t"
    222 
    223         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    224         "mthi     $zero,                $ac1                            \n\t"
    225         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    226         "mthi     $zero,                $ac3                            \n\t"
    227 
    228         "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"
    229         "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"
    230         "extp     %[temp0],             $ac1,           31              \n\t"
    231 
    232         "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"
    233         "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"
    234         "extp     %[temp3],             $ac3,           31              \n\t"
    235 
    236         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    237         "mthi     $zero,                $ac1                            \n\t"
    238         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    239         "mthi     $zero,                $ac2                            \n\t"
    240 
    241         "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"
    242         "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"
    243         "extp     %[temp1],             $ac2,           31              \n\t"
    244 
    245         "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"
    246         "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"
    247         "extp     %[temp2],             $ac1,           31              \n\t"
    248 
    249         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    250         "mthi     $zero,                $ac1                            \n\t"
    251         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    252         "mthi     $zero,                $ac3                            \n\t"
    253 
    254         "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
    255         "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
    256 
    257         "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"
    258         "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"
    259         "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"
    260         "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"
    261 
    262         "extp     %[step1_21],          $ac1,           31              \n\t"
    263         "extp     %[step1_26],          $ac3,           31              \n\t"
    264         "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"
    265         "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"
    266 
    267         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
    268           [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
    269           [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
    270           [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21),
    271           [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27)
    272         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    273           [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
    274           [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
    275           [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
    276 
    277     __asm__ __volatile__(
    278         "lh       %[load1],             26(%[input])                    \n\t"
    279         "lh       %[load2],             38(%[input])                    \n\t"
    280         "lh       %[load3],             58(%[input])                    \n\t"
    281         "lh       %[load4],              6(%[input])                    \n\t"
    282 
    283         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    284         "mthi     $zero,                $ac1                            \n\t"
    285         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    286         "mthi     $zero,                $ac3                            \n\t"
    287 
    288         "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"
    289         "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"
    290         "extp     %[temp0],             $ac1,           31              \n\t"
    291         "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"
    292         "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"
    293         "extp     %[temp3],             $ac3,           31              \n\t"
    294 
    295         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    296         "mthi     $zero,                $ac1                            \n\t"
    297         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    298         "mthi     $zero,                $ac2                            \n\t"
    299 
    300         "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"
    301         "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"
    302         "extp     %[temp1],             $ac2,           31              \n\t"
    303         "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"
    304         "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"
    305         "extp     %[temp2],             $ac1,           31              \n\t"
    306 
    307         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    308         "mthi     $zero,                $ac1                            \n\t"
    309         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    310         "mthi     $zero,                $ac3                            \n\t"
    311 
    312         "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
    313         "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
    314         "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"
    315         "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"
    316         "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"
    317         "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"
    318         "extp     %[step1_22],          $ac1,           31              \n\t"
    319         "extp     %[step1_25],          $ac3,           31              \n\t"
    320         "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"
    321         "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"
    322 
    323         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
    324           [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
    325           [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
    326           [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23),
    327           [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25)
    328         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    329           [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
    330           [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
    331           [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
    332 
    333     __asm__ __volatile__(
    334         "lh       %[load1],              4(%[input])                    \n\t"
    335         "lh       %[load2],             60(%[input])                    \n\t"
    336         "lh       %[load3],             36(%[input])                    \n\t"
    337         "lh       %[load4],             28(%[input])                    \n\t"
    338 
    339         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    340         "mthi     $zero,                $ac1                            \n\t"
    341         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    342         "mthi     $zero,                $ac3                            \n\t"
    343 
    344         "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"
    345         "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"
    346         "extp     %[temp0],             $ac1,           31              \n\t"
    347         "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"
    348         "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"
    349         "extp     %[temp3],             $ac3,           31              \n\t"
    350 
    351         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    352         "mthi     $zero,                $ac1                            \n\t"
    353         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    354         "mthi     $zero,                $ac2                            \n\t"
    355 
    356         "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"
    357         "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"
    358         "extp     %[temp1],             $ac2,           31              \n\t"
    359         "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"
    360         "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"
    361         "extp     %[temp2],             $ac1,           31              \n\t"
    362 
    363         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    364         "mthi     $zero,                $ac1                            \n\t"
    365         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    366         "mthi     $zero,                $ac3                            \n\t"
    367 
    368         "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
    369         "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
    370         "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"
    371         "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"
    372         "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"
    373         "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"
    374         "extp     %[step2_9],           $ac1,           31              \n\t"
    375         "extp     %[step2_14],          $ac3,           31              \n\t"
    376         "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"
    377         "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"
    378 
    379         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
    380           [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
    381           [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8),
    382           [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14),
    383           [step2_15] "=&r"(step2_15)
    384         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    385           [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
    386           [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
    387           [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
    388 
    389     __asm__ __volatile__(
    390         "lh       %[load1],             20(%[input])                    \n\t"
    391         "lh       %[load2],             44(%[input])                    \n\t"
    392         "lh       %[load3],             52(%[input])                    \n\t"
    393         "lh       %[load4],             12(%[input])                    \n\t"
    394 
    395         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    396         "mthi     $zero,                $ac1                            \n\t"
    397         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    398         "mthi     $zero,                $ac3                            \n\t"
    399 
    400         "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
    401         "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
    402         "extp     %[temp0],             $ac1,           31              \n\t"
    403         "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"
    404         "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"
    405         "extp     %[temp3],             $ac3,           31              \n\t"
    406 
    407         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    408         "mthi     $zero,                $ac1                            \n\t"
    409         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    410         "mthi     $zero,                $ac2                            \n\t"
    411 
    412         "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"
    413         "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"
    414         "extp     %[temp1],             $ac2,           31              \n\t"
    415         "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"
    416         "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"
    417         "extp     %[temp2],             $ac1,           31              \n\t"
    418 
    419         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    420         "mthi     $zero,                $ac1                            \n\t"
    421         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    422         "mthi     $zero,                $ac3                            \n\t"
    423 
    424         "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
    425         "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
    426         "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
    427         "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
    428         "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
    429         "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
    430         "extp     %[step2_10],          $ac1,           31              \n\t"
    431         "extp     %[step2_13],          $ac3,           31              \n\t"
    432         "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"
    433         "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"
    434 
    435         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
    436           [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
    437           [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
    438           [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11),
    439           [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13)
    440         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    441           [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
    442           [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
    443           [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
    444 
    445     __asm__ __volatile__(
    446         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    447         "mthi     $zero,                $ac0                            \n\t"
    448         "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"
    449         "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"
    450         "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"
    451         "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
    452         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    453         "mthi     $zero,                $ac1                            \n\t"
    454         "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"
    455         "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"
    456         "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"
    457         "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
    458         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    459         "mthi     $zero,                $ac2                            \n\t"
    460         "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"
    461         "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"
    462         "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"
    463         "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"
    464         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    465         "mthi     $zero,                $ac3                            \n\t"
    466         "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"
    467         "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"
    468         "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"
    469         "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"
    470 
    471         "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"
    472         "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"
    473         "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"
    474         "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"
    475         "extp     %[step3_10],          $ac0,           31              \n\t"
    476         "extp     %[step3_13],          $ac1,           31              \n\t"
    477         "extp     %[step3_11],          $ac2,           31              \n\t"
    478         "extp     %[step3_12],          $ac3,           31              \n\t"
    479 
    480         : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8),
    481           [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10),
    482           [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12),
    483           [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14),
    484           [step3_15] "=&r"(step3_15)
    485         : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
    486           [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
    487           [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
    488           [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
    489           [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
    490 
    491     __asm__ __volatile__(
    492         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    493         "mthi     $zero,                $ac0                            \n\t"
    494         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    495         "mthi     $zero,                $ac1                            \n\t"
    496         "sub      %[temp0],             %[step1_17],    %[step1_18]     \n\t"
    497         "sub      %[temp1],             %[step1_30],    %[step1_29]     \n\t"
    498         "add      %[step3_17],          %[step1_17],    %[step1_18]     \n\t"
    499         "add      %[step3_30],          %[step1_30],    %[step1_29]     \n\t"
    500 
    501         "msub     $ac0,                 %[temp0],       %[cospi_8_64]   \n\t"
    502         "madd     $ac0,                 %[temp1],       %[cospi_24_64]  \n\t"
    503         "extp     %[step3_18],          $ac0,           31              \n\t"
    504         "madd     $ac1,                 %[temp0],       %[cospi_24_64]  \n\t"
    505         "madd     $ac1,                 %[temp1],       %[cospi_8_64]   \n\t"
    506         "extp     %[step3_29],          $ac1,           31              \n\t"
    507 
    508         : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
    509           [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29),
    510           [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30)
    511         : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17),
    512           [step1_18] "r"(step1_18), [step1_30] "r"(step1_30),
    513           [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64),
    514           [cospi_8_64] "r"(cospi_8_64));
    515 
    516     __asm__ __volatile__(
    517         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    518         "mthi     $zero,                $ac0                            \n\t"
    519         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    520         "mthi     $zero,                $ac1                            \n\t"
    521         "sub      %[temp0],             %[step1_16],    %[step1_19]     \n\t"
    522         "sub      %[temp1],             %[step1_31],    %[step1_28]     \n\t"
    523         "add      %[step3_16],          %[step1_16],    %[step1_19]     \n\t"
    524         "add      %[step3_31],          %[step1_31],    %[step1_28]     \n\t"
    525 
    526         "msub     $ac0,                 %[temp0],       %[cospi_8_64]   \n\t"
    527         "madd     $ac0,                 %[temp1],       %[cospi_24_64]  \n\t"
    528         "extp     %[step3_19],          $ac0,           31              \n\t"
    529         "madd     $ac1,                 %[temp0],       %[cospi_24_64]  \n\t"
    530         "madd     $ac1,                 %[temp1],       %[cospi_8_64]   \n\t"
    531         "extp     %[step3_28],          $ac1,           31              \n\t"
    532 
    533         : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
    534           [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31),
    535           [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28)
    536         : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16),
    537           [step1_19] "r"(step1_19), [step1_31] "r"(step1_31),
    538           [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64),
    539           [cospi_8_64] "r"(cospi_8_64));
    540 
    541     __asm__ __volatile__(
    542         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    543         "mthi     $zero,                $ac0                            \n\t"
    544         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    545         "mthi     $zero,                $ac1                            \n\t"
    546         "sub      %[temp0],             %[step1_23],    %[step1_20]     \n\t"
    547         "sub      %[temp1],             %[step1_24],    %[step1_27]     \n\t"
    548         "add      %[step3_23],          %[step1_23],    %[step1_20]     \n\t"
    549         "add      %[step3_24],          %[step1_24],    %[step1_27]     \n\t"
    550 
    551         "msub     $ac0,                 %[temp0],       %[cospi_8_64]   \n\t"
    552         "madd     $ac0,                 %[temp1],       %[cospi_24_64]  \n\t"
    553         "extp     %[step3_27],          $ac0,           31              \n\t"
    554         "msub     $ac1,                 %[temp0],       %[cospi_24_64]  \n\t"
    555         "msub     $ac1,                 %[temp1],       %[cospi_8_64]   \n\t"
    556         "extp     %[step3_20],          $ac1,           31              \n\t"
    557 
    558         : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
    559           [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24),
    560           [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27)
    561         : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23),
    562           [step1_20] "r"(step1_20), [step1_24] "r"(step1_24),
    563           [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64),
    564           [cospi_8_64] "r"(cospi_8_64));
    565 
    566     __asm__ __volatile__(
    567         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    568         "mthi     $zero,                $ac0                            \n\t"
    569         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    570         "mthi     $zero,                $ac1                            \n\t"
    571         "sub      %[temp0],             %[step1_22],    %[step1_21]     \n\t"
    572         "sub      %[temp1],             %[step1_25],    %[step1_26]     \n\t"
    573         "add      %[step3_22],          %[step1_22],    %[step1_21]     \n\t"
    574         "add      %[step3_25],          %[step1_25],    %[step1_26]     \n\t"
    575 
    576         "msub     $ac0,                 %[temp0],       %[cospi_24_64]  \n\t"
    577         "msub     $ac0,                 %[temp1],       %[cospi_8_64]   \n\t"
    578         "extp     %[step3_21],          $ac0,           31              \n\t"
    579         "msub     $ac1,                 %[temp0],       %[cospi_8_64]   \n\t"
    580         "madd     $ac1,                 %[temp1],       %[cospi_24_64]  \n\t"
    581         "extp     %[step3_26],          $ac1,           31              \n\t"
    582 
    583         : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
    584           [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25),
    585           [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26)
    586         : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22),
    587           [step1_21] "r"(step1_21), [step1_25] "r"(step1_25),
    588           [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64),
    589           [cospi_8_64] "r"(cospi_8_64));
    590 
    591     __asm__ __volatile__(
    592         "add      %[step2_16],          %[step3_16],    %[step3_23]     \n\t"
    593         "add      %[step2_17],          %[step3_17],    %[step3_22]     \n\t"
    594         "add      %[step2_18],          %[step3_18],    %[step3_21]     \n\t"
    595         "add      %[step2_19],          %[step3_19],    %[step3_20]     \n\t"
    596         "sub      %[step2_20],          %[step3_19],    %[step3_20]     \n\t"
    597         "sub      %[step2_21],          %[step3_18],    %[step3_21]     \n\t"
    598         "sub      %[step2_22],          %[step3_17],    %[step3_22]     \n\t"
    599         "sub      %[step2_23],          %[step3_16],    %[step3_23]     \n\t"
    600 
    601         : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17),
    602           [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19),
    603           [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21),
    604           [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23)
    605         : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23),
    606           [step3_17] "r"(step3_17), [step3_22] "r"(step3_22),
    607           [step3_18] "r"(step3_18), [step3_21] "r"(step3_21),
    608           [step3_19] "r"(step3_19), [step3_20] "r"(step3_20));
    609 
    610     __asm__ __volatile__(
    611         "sub      %[step2_24],          %[step3_31],    %[step3_24]     \n\t"
    612         "sub      %[step2_25],          %[step3_30],    %[step3_25]     \n\t"
    613         "sub      %[step2_26],          %[step3_29],    %[step3_26]     \n\t"
    614         "sub      %[step2_27],          %[step3_28],    %[step3_27]     \n\t"
    615         "add      %[step2_28],          %[step3_28],    %[step3_27]     \n\t"
    616         "add      %[step2_29],          %[step3_29],    %[step3_26]     \n\t"
    617         "add      %[step2_30],          %[step3_30],    %[step3_25]     \n\t"
    618         "add      %[step2_31],          %[step3_31],    %[step3_24]     \n\t"
    619 
    620         : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28),
    621           [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29),
    622           [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30),
    623           [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31)
    624         : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24),
    625           [step3_30] "r"(step3_30), [step3_25] "r"(step3_25),
    626           [step3_29] "r"(step3_29), [step3_26] "r"(step3_26),
    627           [step3_28] "r"(step3_28), [step3_27] "r"(step3_27));
    628 
    629     __asm__ __volatile__(
    630         "lh       %[load1],             0(%[input])                     \n\t"
    631         "lh       %[load2],             32(%[input])                    \n\t"
    632         "lh       %[load3],             16(%[input])                    \n\t"
    633         "lh       %[load4],             48(%[input])                    \n\t"
    634 
    635         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    636         "mthi     $zero,                $ac1                            \n\t"
    637         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    638         "mthi     $zero,                $ac2                            \n\t"
    639         "add      %[result1],           %[load1],       %[load2]        \n\t"
    640         "sub      %[result2],           %[load1],       %[load2]        \n\t"
    641         "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
    642         "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
    643         "extp     %[temp0],             $ac1,           31              \n\t"
    644         "extp     %[temp1],             $ac2,           31              \n\t"
    645 
    646         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    647         "mthi     $zero,                $ac3                            \n\t"
    648         "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
    649         "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
    650         "extp     %[temp2],             $ac3,           31              \n\t"
    651         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    652         "mthi     $zero,                $ac1                            \n\t"
    653         "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
    654         "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
    655         "extp     %[temp3],             $ac1,           31              \n\t"
    656         "add      %[step1_0],           %[temp0],       %[temp3]        \n\t"
    657         "add      %[step1_1],           %[temp1],       %[temp2]        \n\t"
    658         "sub      %[step1_2],           %[temp1],       %[temp2]        \n\t"
    659         "sub      %[step1_3],           %[temp0],       %[temp3]        \n\t"
    660 
    661         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
    662           [load4] "=&r"(load4), [result1] "=&r"(result1),
    663           [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
    664           [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0),
    665           [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2),
    666           [step1_3] "=&r"(step1_3)
    667         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    668           [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
    669           [cospi_16_64] "r"(cospi_16_64));
    670 
    671     __asm__ __volatile__(
    672         "lh       %[load1],             8(%[input])                     \n\t"
    673         "lh       %[load2],             56(%[input])                    \n\t"
    674         "lh       %[load3],             40(%[input])                    \n\t"
    675         "lh       %[load4],             24(%[input])                    \n\t"
    676 
    677         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    678         "mthi     $zero,                $ac1                            \n\t"
    679         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    680         "mthi     $zero,                $ac3                            \n\t"
    681 
    682         "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
    683         "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
    684         "extp     %[temp0],             $ac1,           31              \n\t"
    685         "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
    686         "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
    687         "extp     %[temp3],             $ac3,           31              \n\t"
    688 
    689         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    690         "mthi     $zero,                $ac1                            \n\t"
    691         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    692         "mthi     $zero,                $ac2                            \n\t"
    693 
    694         "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"
    695         "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"
    696         "extp     %[temp1],             $ac2,           31              \n\t"
    697         "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"
    698         "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"
    699         "extp     %[temp2],             $ac1,           31              \n\t"
    700 
    701         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    702         "mthi     $zero,                $ac1                            \n\t"
    703         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    704         "mthi     $zero,                $ac3                            \n\t"
    705 
    706         "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
    707         "sub      %[load1],             %[load1],       %[temp0]        \n\t"
    708         "add      %[load1],             %[load1],       %[temp1]        \n\t"
    709         "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
    710         "sub      %[load2],             %[load2],       %[temp2]        \n\t"
    711         "add      %[load2],             %[load2],       %[temp3]        \n\t"
    712         "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"
    713         "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"
    714 
    715         "extp     %[step1_5],           $ac1,           31              \n\t"
    716         "extp     %[step1_6],           $ac3,           31              \n\t"
    717         "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"
    718         "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"
    719 
    720         : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
    721           [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
    722           [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4),
    723           [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6),
    724           [step1_7] "=&r"(step1_7)
    725         : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
    726           [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
    727           [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
    728           [cospi_16_64] "r"(cospi_16_64));
    729 
    730     __asm__ __volatile__(
    731         "add      %[step2_0],          %[step1_0],    %[step1_7]     \n\t"
    732         "add      %[step2_1],          %[step1_1],    %[step1_6]     \n\t"
    733         "add      %[step2_2],          %[step1_2],    %[step1_5]     \n\t"
    734         "add      %[step2_3],          %[step1_3],    %[step1_4]     \n\t"
    735         "sub      %[step2_4],          %[step1_3],    %[step1_4]     \n\t"
    736         "sub      %[step2_5],          %[step1_2],    %[step1_5]     \n\t"
    737         "sub      %[step2_6],          %[step1_1],    %[step1_6]     \n\t"
    738         "sub      %[step2_7],          %[step1_0],    %[step1_7]     \n\t"
    739 
    740         : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4),
    741           [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5),
    742           [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6),
    743           [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7)
    744         : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7),
    745           [step1_1] "r"(step1_1), [step1_6] "r"(step1_6),
    746           [step1_2] "r"(step1_2), [step1_5] "r"(step1_5),
    747           [step1_3] "r"(step1_3), [step1_4] "r"(step1_4));
    748 
    749     // stage 7
    750     __asm__ __volatile__(
    751         "add      %[step1_0],          %[step2_0],    %[step3_15]     \n\t"
    752         "add      %[step1_1],          %[step2_1],    %[step3_14]     \n\t"
    753         "add      %[step1_2],          %[step2_2],    %[step3_13]     \n\t"
    754         "add      %[step1_3],          %[step2_3],    %[step3_12]     \n\t"
    755         "sub      %[step1_12],         %[step2_3],    %[step3_12]     \n\t"
    756         "sub      %[step1_13],         %[step2_2],    %[step3_13]     \n\t"
    757         "sub      %[step1_14],         %[step2_1],    %[step3_14]     \n\t"
    758         "sub      %[step1_15],         %[step2_0],    %[step3_15]     \n\t"
    759 
    760         : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12),
    761           [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13),
    762           [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14),
    763           [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15)
    764         : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15),
    765           [step2_1] "r"(step2_1), [step3_14] "r"(step3_14),
    766           [step2_2] "r"(step2_2), [step3_13] "r"(step3_13),
    767           [step2_3] "r"(step2_3), [step3_12] "r"(step3_12));
    768 
    769     __asm__ __volatile__(
    770         "add      %[step1_4],          %[step2_4],    %[step3_11]     \n\t"
    771         "add      %[step1_5],          %[step2_5],    %[step3_10]     \n\t"
    772         "add      %[step1_6],          %[step2_6],    %[step3_9]      \n\t"
    773         "add      %[step1_7],          %[step2_7],    %[step3_8]      \n\t"
    774         "sub      %[step1_8],          %[step2_7],    %[step3_8]      \n\t"
    775         "sub      %[step1_9],          %[step2_6],    %[step3_9]      \n\t"
    776         "sub      %[step1_10],         %[step2_5],    %[step3_10]     \n\t"
    777         "sub      %[step1_11],         %[step2_4],    %[step3_11]     \n\t"
    778 
    779         : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8),
    780           [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9),
    781           [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10),
    782           [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11)
    783         : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11),
    784           [step2_5] "r"(step2_5), [step3_10] "r"(step3_10),
    785           [step2_6] "r"(step2_6), [step3_9] "r"(step3_9),
    786           [step2_7] "r"(step2_7), [step3_8] "r"(step3_8));
    787 
    788     __asm__ __volatile__(
    789         "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"
    790         "add      %[temp1],             %[step2_27],    %[step2_20]     \n\t"
    791         "sub      %[temp2],             %[step2_26],    %[step2_21]     \n\t"
    792         "add      %[temp3],             %[step2_26],    %[step2_21]     \n\t"
    793 
    794         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    795         "mthi     $zero,                $ac0                            \n\t"
    796         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    797         "mthi     $zero,                $ac1                            \n\t"
    798         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    799         "mthi     $zero,                $ac2                            \n\t"
    800         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    801         "mthi     $zero,                $ac3                            \n\t"
    802 
    803         "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
    804         "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
    805         "madd     $ac2,                 %[temp2],       %[cospi_16_64]  \n\t"
    806         "madd     $ac3,                 %[temp3],       %[cospi_16_64]  \n\t"
    807 
    808         "extp     %[step1_20],          $ac0,           31              \n\t"
    809         "extp     %[step1_27],          $ac1,           31              \n\t"
    810         "extp     %[step1_21],          $ac2,           31              \n\t"
    811         "extp     %[step1_26],          $ac3,           31              \n\t"
    812 
    813         : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
    814           [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20),
    815           [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21),
    816           [step1_26] "=&r"(step1_26)
    817         : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
    818           [step2_27] "r"(step2_27), [step2_21] "r"(step2_21),
    819           [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64));
    820 
    821     __asm__ __volatile__(
    822         "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"
    823         "add      %[temp1],             %[step2_25],    %[step2_22]     \n\t"
    824         "sub      %[temp2],             %[step2_24],    %[step2_23]     \n\t"
    825         "add      %[temp3],             %[step2_24],    %[step2_23]     \n\t"
    826 
    827         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    828         "mthi     $zero,                $ac0                            \n\t"
    829         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    830         "mthi     $zero,                $ac1                            \n\t"
    831         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    832         "mthi     $zero,                $ac2                            \n\t"
    833         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    834         "mthi     $zero,                $ac3                            \n\t"
    835 
    836         "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
    837         "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
    838         "madd     $ac2,                 %[temp2],       %[cospi_16_64]  \n\t"
    839         "madd     $ac3,                 %[temp3],       %[cospi_16_64]  \n\t"
    840 
    841         "extp     %[step1_22],          $ac0,           31              \n\t"
    842         "extp     %[step1_25],          $ac1,           31              \n\t"
    843         "extp     %[step1_23],          $ac2,           31              \n\t"
    844         "extp     %[step1_24],          $ac3,           31              \n\t"
    845 
    846         : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
    847           [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22),
    848           [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23),
    849           [step1_24] "=&r"(step1_24)
    850         : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22),
    851           [step2_25] "r"(step2_25), [step2_23] "r"(step2_23),
    852           [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64));
    853 
    854     // final stage
    855     __asm__ __volatile__(
    856         "add      %[temp0],            %[step1_0],    %[step2_31]     \n\t"
    857         "add      %[temp1],            %[step1_1],    %[step2_30]     \n\t"
    858         "add      %[temp2],            %[step1_2],    %[step2_29]     \n\t"
    859         "add      %[temp3],            %[step1_3],    %[step2_28]     \n\t"
    860         "sub      %[load1],            %[step1_3],    %[step2_28]     \n\t"
    861         "sub      %[load2],            %[step1_2],    %[step2_29]     \n\t"
    862         "sub      %[load3],            %[step1_1],    %[step2_30]     \n\t"
    863         "sub      %[load4],            %[step1_0],    %[step2_31]     \n\t"
    864         "sh       %[temp0],          0(%[output])                     \n\t"
    865         "sh       %[temp1],         64(%[output])                     \n\t"
    866         "sh       %[temp2],        128(%[output])                     \n\t"
    867         "sh       %[temp3],        192(%[output])                     \n\t"
    868         "sh       %[load1],       1792(%[output])                     \n\t"
    869         "sh       %[load2],       1856(%[output])                     \n\t"
    870         "sh       %[load3],       1920(%[output])                     \n\t"
    871         "sh       %[load4],       1984(%[output])                     \n\t"
    872 
    873         : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
    874           [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
    875           [temp3] "=&r"(temp3), [load4] "=&r"(load4)
    876         : [step1_0] "r"(step1_0), [step2_31] "r"(step2_31),
    877           [step1_1] "r"(step1_1), [step2_30] "r"(step2_30),
    878           [step1_2] "r"(step1_2), [step2_29] "r"(step2_29),
    879           [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
    880           [output] "r"(output));
    881 
    882     __asm__ __volatile__(
    883         "add      %[temp0],            %[step1_4],    %[step1_27]     \n\t"
    884         "add      %[temp1],            %[step1_5],    %[step1_26]     \n\t"
    885         "add      %[temp2],            %[step1_6],    %[step1_25]     \n\t"
    886         "add      %[temp3],            %[step1_7],    %[step1_24]     \n\t"
    887         "sub      %[load1],            %[step1_7],    %[step1_24]     \n\t"
    888         "sub      %[load2],            %[step1_6],    %[step1_25]     \n\t"
    889         "sub      %[load3],            %[step1_5],    %[step1_26]     \n\t"
    890         "sub      %[load4],            %[step1_4],    %[step1_27]     \n\t"
    891         "sh       %[temp0],        256(%[output])                     \n\t"
    892         "sh       %[temp1],        320(%[output])                     \n\t"
    893         "sh       %[temp2],        384(%[output])                     \n\t"
    894         "sh       %[temp3],        448(%[output])                     \n\t"
    895         "sh       %[load1],       1536(%[output])                     \n\t"
    896         "sh       %[load2],       1600(%[output])                     \n\t"
    897         "sh       %[load3],       1664(%[output])                     \n\t"
    898         "sh       %[load4],       1728(%[output])                     \n\t"
    899 
    900         : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
    901           [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
    902           [temp3] "=&r"(temp3), [load4] "=&r"(load4)
    903         : [step1_4] "r"(step1_4), [step1_27] "r"(step1_27),
    904           [step1_5] "r"(step1_5), [step1_26] "r"(step1_26),
    905           [step1_6] "r"(step1_6), [step1_25] "r"(step1_25),
    906           [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
    907           [output] "r"(output));
    908 
    909     __asm__ __volatile__(
    910         "add      %[temp0],            %[step1_8],     %[step1_23]     \n\t"
    911         "add      %[temp1],            %[step1_9],     %[step1_22]     \n\t"
    912         "add      %[temp2],            %[step1_10],    %[step1_21]     \n\t"
    913         "add      %[temp3],            %[step1_11],    %[step1_20]     \n\t"
    914         "sub      %[load1],            %[step1_11],    %[step1_20]     \n\t"
    915         "sub      %[load2],            %[step1_10],    %[step1_21]     \n\t"
    916         "sub      %[load3],            %[step1_9],     %[step1_22]     \n\t"
    917         "sub      %[load4],            %[step1_8],     %[step1_23]     \n\t"
    918         "sh       %[temp0],        512(%[output])                      \n\t"
    919         "sh       %[temp1],        576(%[output])                      \n\t"
    920         "sh       %[temp2],        640(%[output])                      \n\t"
    921         "sh       %[temp3],        704(%[output])                      \n\t"
    922         "sh       %[load1],       1280(%[output])                      \n\t"
    923         "sh       %[load2],       1344(%[output])                      \n\t"
    924         "sh       %[load3],       1408(%[output])                      \n\t"
    925         "sh       %[load4],       1472(%[output])                      \n\t"
    926 
    927         : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
    928           [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
    929           [temp3] "=&r"(temp3), [load4] "=&r"(load4)
    930         : [step1_8] "r"(step1_8), [step1_23] "r"(step1_23),
    931           [step1_9] "r"(step1_9), [step1_22] "r"(step1_22),
    932           [step1_10] "r"(step1_10), [step1_21] "r"(step1_21),
    933           [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
    934           [output] "r"(output));
    935 
    936     __asm__ __volatile__(
    937         "add      %[temp0],            %[step1_12],    %[step2_19]     \n\t"
    938         "add      %[temp1],            %[step1_13],    %[step2_18]     \n\t"
    939         "add      %[temp2],            %[step1_14],    %[step2_17]     \n\t"
    940         "add      %[temp3],            %[step1_15],    %[step2_16]     \n\t"
    941         "sub      %[load1],            %[step1_15],    %[step2_16]     \n\t"
    942         "sub      %[load2],            %[step1_14],    %[step2_17]     \n\t"
    943         "sub      %[load3],            %[step1_13],    %[step2_18]     \n\t"
    944         "sub      %[load4],            %[step1_12],    %[step2_19]     \n\t"
    945         "sh       %[temp0],        768(%[output])                      \n\t"
    946         "sh       %[temp1],        832(%[output])                      \n\t"
    947         "sh       %[temp2],        896(%[output])                      \n\t"
    948         "sh       %[temp3],        960(%[output])                      \n\t"
    949         "sh       %[load1],       1024(%[output])                      \n\t"
    950         "sh       %[load2],       1088(%[output])                      \n\t"
    951         "sh       %[load3],       1152(%[output])                      \n\t"
    952         "sh       %[load4],       1216(%[output])                      \n\t"
    953 
    954         : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
    955           [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
    956           [temp3] "=&r"(temp3), [load4] "=&r"(load4)
    957         : [step1_12] "r"(step1_12), [step2_19] "r"(step2_19),
    958           [step1_13] "r"(step1_13), [step2_18] "r"(step2_18),
    959           [step1_14] "r"(step1_14), [step2_17] "r"(step2_17),
    960           [step1_15] "r"(step1_15), [step2_16] "r"(step2_16),
    961           [output] "r"(output));
    962 
    963     input += 32;
    964     output += 1;
    965   }
    966 }
    967 
    968 void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
    969                                   int stride) {
    970   DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
    971   int16_t *outptr = out;
    972   uint32_t pos = 45;
    973 
    974   /* bit positon for extract from acc */
    975   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
    976                        :
    977                        : [pos] "r"(pos));
    978 
    979   // Rows
    980   idct32_rows_dspr2(input, outptr, 32);
    981 
    982   // Columns
    983   vpx_idct32_cols_add_blk_dspr2(out, dest, stride);
    984 }
    985 
    986 void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
    987                                 int stride) {
    988   DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
    989   int16_t *outptr = out;
    990   uint32_t i;
    991   uint32_t pos = 45;
    992 
    993   /* bit positon for extract from acc */
    994   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
    995                        :
    996                        : [pos] "r"(pos));
    997 
    998   // Rows
    999   idct32_rows_dspr2(input, outptr, 8);
   1000 
   1001   outptr += 8;
   1002   __asm__ __volatile__(
   1003       "sw     $zero,      0(%[outptr])     \n\t"
   1004       "sw     $zero,      4(%[outptr])     \n\t"
   1005       "sw     $zero,      8(%[outptr])     \n\t"
   1006       "sw     $zero,     12(%[outptr])     \n\t"
   1007       "sw     $zero,     16(%[outptr])     \n\t"
   1008       "sw     $zero,     20(%[outptr])     \n\t"
   1009       "sw     $zero,     24(%[outptr])     \n\t"
   1010       "sw     $zero,     28(%[outptr])     \n\t"
   1011       "sw     $zero,     32(%[outptr])     \n\t"
   1012       "sw     $zero,     36(%[outptr])     \n\t"
   1013       "sw     $zero,     40(%[outptr])     \n\t"
   1014       "sw     $zero,     44(%[outptr])     \n\t"
   1015 
   1016       :
   1017       : [outptr] "r"(outptr));
   1018 
   1019   for (i = 0; i < 31; ++i) {
   1020     outptr += 32;
   1021 
   1022     __asm__ __volatile__(
   1023         "sw     $zero,      0(%[outptr])     \n\t"
   1024         "sw     $zero,      4(%[outptr])     \n\t"
   1025         "sw     $zero,      8(%[outptr])     \n\t"
   1026         "sw     $zero,     12(%[outptr])     \n\t"
   1027         "sw     $zero,     16(%[outptr])     \n\t"
   1028         "sw     $zero,     20(%[outptr])     \n\t"
   1029         "sw     $zero,     24(%[outptr])     \n\t"
   1030         "sw     $zero,     28(%[outptr])     \n\t"
   1031         "sw     $zero,     32(%[outptr])     \n\t"
   1032         "sw     $zero,     36(%[outptr])     \n\t"
   1033         "sw     $zero,     40(%[outptr])     \n\t"
   1034         "sw     $zero,     44(%[outptr])     \n\t"
   1035 
   1036         :
   1037         : [outptr] "r"(outptr));
   1038   }
   1039 
   1040   // Columns
   1041   vpx_idct32_cols_add_blk_dspr2(out, dest, stride);
   1042 }
   1043 
   1044 void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
   1045                                int stride) {
   1046   int r, out;
   1047   int32_t a1, absa1;
   1048   int32_t vector_a1;
   1049   int32_t t1, t2, t3, t4;
   1050   int32_t vector_1, vector_2, vector_3, vector_4;
   1051   uint32_t pos = 45;
   1052 
   1053   /* bit positon for extract from acc */
   1054   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
   1055 
   1056                        :
   1057                        : [pos] "r"(pos));
   1058 
   1059   out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
   1060   __asm__ __volatile__(
   1061       "addi     %[out],    %[out],    32      \n\t"
   1062       "sra      %[a1],     %[out],    6       \n\t"
   1063 
   1064       : [out] "+r"(out), [a1] "=r"(a1)
   1065       :);
   1066 
   1067   if (a1 < 0) {
   1068     /* use quad-byte
   1069      * input and output memory are four byte aligned */
   1070     __asm__ __volatile__(
   1071         "abs        %[absa1],     %[a1]         \n\t"
   1072         "replv.qb   %[vector_a1], %[absa1]      \n\t"
   1073 
   1074         : [absa1] "=&r"(absa1), [vector_a1] "=&r"(vector_a1)
   1075         : [a1] "r"(a1));
   1076 
   1077     for (r = 32; r--;) {
   1078       __asm__ __volatile__(
   1079           "lw             %[t1],          0(%[dest])                      \n\t"
   1080           "lw             %[t2],          4(%[dest])                      \n\t"
   1081           "lw             %[t3],          8(%[dest])                      \n\t"
   1082           "lw             %[t4],          12(%[dest])                     \n\t"
   1083           "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
   1084           "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
   1085           "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
   1086           "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
   1087           "sw             %[vector_1],    0(%[dest])                      \n\t"
   1088           "sw             %[vector_2],    4(%[dest])                      \n\t"
   1089           "sw             %[vector_3],    8(%[dest])                      \n\t"
   1090           "sw             %[vector_4],    12(%[dest])                     \n\t"
   1091 
   1092           "lw             %[t1],          16(%[dest])                     \n\t"
   1093           "lw             %[t2],          20(%[dest])                     \n\t"
   1094           "lw             %[t3],          24(%[dest])                     \n\t"
   1095           "lw             %[t4],          28(%[dest])                     \n\t"
   1096           "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
   1097           "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
   1098           "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
   1099           "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
   1100           "sw             %[vector_1],    16(%[dest])                     \n\t"
   1101           "sw             %[vector_2],    20(%[dest])                     \n\t"
   1102           "sw             %[vector_3],    24(%[dest])                     \n\t"
   1103           "sw             %[vector_4],    28(%[dest])                     \n\t"
   1104 
   1105           "add            %[dest],        %[dest],        %[stride]       \n\t"
   1106 
   1107           : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
   1108             [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
   1109             [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
   1110             [dest] "+&r"(dest)
   1111           : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
   1112     }
   1113   } else if (a1 > 255) {
   1114     int32_t a11, a12, vector_a11, vector_a12;
   1115 
   1116     /* use quad-byte
   1117      * input and output memory are four byte aligned */
   1118     a11 = a1 >> 1;
   1119     a12 = a1 - a11;
   1120     __asm__ __volatile__(
   1121         "replv.qb       %[vector_a11],  %[a11]     \n\t"
   1122         "replv.qb       %[vector_a12],  %[a12]     \n\t"
   1123 
   1124         : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
   1125         : [a11] "r"(a11), [a12] "r"(a12));
   1126 
   1127     for (r = 32; r--;) {
   1128       __asm__ __volatile__(
   1129           "lw             %[t1],          0(%[dest])                      \n\t"
   1130           "lw             %[t2],          4(%[dest])                      \n\t"
   1131           "lw             %[t3],          8(%[dest])                      \n\t"
   1132           "lw             %[t4],          12(%[dest])                     \n\t"
   1133           "addu_s.qb      %[vector_1],    %[t1],          %[vector_a11]   \n\t"
   1134           "addu_s.qb      %[vector_2],    %[t2],          %[vector_a11]   \n\t"
   1135           "addu_s.qb      %[vector_3],    %[t3],          %[vector_a11]   \n\t"
   1136           "addu_s.qb      %[vector_4],    %[t4],          %[vector_a11]   \n\t"
   1137           "addu_s.qb      %[vector_1],    %[vector_1],    %[vector_a12]   \n\t"
   1138           "addu_s.qb      %[vector_2],    %[vector_2],    %[vector_a12]   \n\t"
   1139           "addu_s.qb      %[vector_3],    %[vector_3],    %[vector_a12]   \n\t"
   1140           "addu_s.qb      %[vector_4],    %[vector_4],    %[vector_a12]   \n\t"
   1141           "sw             %[vector_1],    0(%[dest])                      \n\t"
   1142           "sw             %[vector_2],    4(%[dest])                      \n\t"
   1143           "sw             %[vector_3],    8(%[dest])                      \n\t"
   1144           "sw             %[vector_4],    12(%[dest])                     \n\t"
   1145 
   1146           "lw             %[t1],          16(%[dest])                     \n\t"
   1147           "lw             %[t2],          20(%[dest])                     \n\t"
   1148           "lw             %[t3],          24(%[dest])                     \n\t"
   1149           "lw             %[t4],          28(%[dest])                     \n\t"
   1150           "addu_s.qb      %[vector_1],    %[t1],          %[vector_a11]    \n\t"
   1151           "addu_s.qb      %[vector_2],    %[t2],          %[vector_a11]    \n\t"
   1152           "addu_s.qb      %[vector_3],    %[t3],          %[vector_a11]    \n\t"
   1153           "addu_s.qb      %[vector_4],    %[t4],          %[vector_a11]    \n\t"
   1154           "addu_s.qb      %[vector_1],    %[vector_1],    %[vector_a12]   \n\t"
   1155           "addu_s.qb      %[vector_2],    %[vector_2],    %[vector_a12]   \n\t"
   1156           "addu_s.qb      %[vector_3],    %[vector_3],    %[vector_a12]   \n\t"
   1157           "addu_s.qb      %[vector_4],    %[vector_4],    %[vector_a12]   \n\t"
   1158           "sw             %[vector_1],    16(%[dest])                     \n\t"
   1159           "sw             %[vector_2],    20(%[dest])                     \n\t"
   1160           "sw             %[vector_3],    24(%[dest])                     \n\t"
   1161           "sw             %[vector_4],    28(%[dest])                     \n\t"
   1162 
   1163           "add            %[dest],        %[dest],        %[stride]       \n\t"
   1164 
   1165           : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
   1166             [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
   1167             [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
   1168             [dest] "+&r"(dest)
   1169           : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
   1170             [vector_a12] "r"(vector_a12));
   1171     }
   1172   } else {
   1173     /* use quad-byte
   1174      * input and output memory are four byte aligned */
   1175     __asm__ __volatile__("replv.qb       %[vector_a1],   %[a1]     \n\t"
   1176 
   1177                          : [vector_a1] "=&r"(vector_a1)
   1178                          : [a1] "r"(a1));
   1179 
   1180     for (r = 32; r--;) {
   1181       __asm__ __volatile__(
   1182           "lw             %[t1],          0(%[dest])                      \n\t"
   1183           "lw             %[t2],          4(%[dest])                      \n\t"
   1184           "lw             %[t3],          8(%[dest])                      \n\t"
   1185           "lw             %[t4],          12(%[dest])                     \n\t"
   1186           "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
   1187           "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
   1188           "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
   1189           "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
   1190           "sw             %[vector_1],    0(%[dest])                      \n\t"
   1191           "sw             %[vector_2],    4(%[dest])                      \n\t"
   1192           "sw             %[vector_3],    8(%[dest])                      \n\t"
   1193           "sw             %[vector_4],    12(%[dest])                     \n\t"
   1194 
   1195           "lw             %[t1],          16(%[dest])                     \n\t"
   1196           "lw             %[t2],          20(%[dest])                     \n\t"
   1197           "lw             %[t3],          24(%[dest])                     \n\t"
   1198           "lw             %[t4],          28(%[dest])                     \n\t"
   1199           "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
   1200           "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
   1201           "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
   1202           "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
   1203           "sw             %[vector_1],    16(%[dest])                     \n\t"
   1204           "sw             %[vector_2],    20(%[dest])                     \n\t"
   1205           "sw             %[vector_3],    24(%[dest])                     \n\t"
   1206           "sw             %[vector_4],    28(%[dest])                     \n\t"
   1207 
   1208           "add            %[dest],        %[dest],        %[stride]       \n\t"
   1209 
   1210           : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
   1211             [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
   1212             [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
   1213             [dest] "+&r"(dest)
   1214           : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
   1215     }
   1216   }
   1217 }
   1218 #endif  // #if HAVE_DSPR2
   1219