Home | History | Annotate | Download | only in dspr2
      1 /*
      2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 
     13 #include "./vpx_config.h"
     14 #include "./vp9_rtcd.h"
     15 #include "vp9/common/vp9_common.h"
     16 #include "vp9/common/vp9_blockd.h"
     17 #include "vp9/common/vp9_idct.h"
     18 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
     19 
     20 #if HAVE_DSPR2
     21 void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
     22                                       int dest_stride) {
     23   int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
     24   int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
     25   int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
     26   int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;
     27   int16_t step1_27, step1_28, step1_29, step1_30, step1_31;
     28   int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
     29   int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
     30   int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
     31   int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
     32   int16_t step2_28, step2_29, step2_30, step2_31;
     33   int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
     34   int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
     35   int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;
     36   int16_t step3_28, step3_29, step3_30, step3_31;
     37   int temp0, temp1, temp2, temp3;
     38   int load1, load2, load3, load4;
     39   int result1, result2;
     40   int i, temp21;
     41   uint8_t *dest_pix, *dest_pix1;
     42   const int const_2_power_13 = 8192;
     43   uint8_t *cm = vp9_ff_cropTbl;
     44 
     45   /* prefetch vp9_ff_cropTbl */
     46   vp9_prefetch_load(vp9_ff_cropTbl);
     47   vp9_prefetch_load(vp9_ff_cropTbl +  32);
     48   vp9_prefetch_load(vp9_ff_cropTbl +  64);
     49   vp9_prefetch_load(vp9_ff_cropTbl +  96);
     50   vp9_prefetch_load(vp9_ff_cropTbl + 128);
     51   vp9_prefetch_load(vp9_ff_cropTbl + 160);
     52   vp9_prefetch_load(vp9_ff_cropTbl + 192);
     53   vp9_prefetch_load(vp9_ff_cropTbl + 224);
     54 
     55   for (i = 0; i < 32; ++i) {
     56     dest_pix = dest + i;
     57     dest_pix1 = dest + i + 31 * dest_stride;
     58 
     59     __asm__ __volatile__ (
     60         "lh       %[load1],             2(%[input])                     \n\t"
     61         "lh       %[load2],             62(%[input])                    \n\t"
     62         "lh       %[load3],             34(%[input])                    \n\t"
     63         "lh       %[load4],             30(%[input])                    \n\t"
     64 
     65         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
     66         "mthi     $zero,                $ac1                            \n\t"
     67         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
     68         "mthi     $zero,                $ac3                            \n\t"
     69 
     70         "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"
     71         "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"
     72         "extp     %[temp0],             $ac1,           31              \n\t"
     73 
     74         "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"
     75         "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"
     76         "extp     %[temp3],             $ac3,           31              \n\t"
     77 
     78         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
     79         "mthi     $zero,                $ac1                            \n\t"
     80         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
     81         "mthi     $zero,                $ac2                            \n\t"
     82 
     83         "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"
     84         "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"
     85         "extp     %[temp1],             $ac2,           31              \n\t"
     86 
     87         "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"
     88         "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"
     89         "extp     %[temp2],             $ac1,           31              \n\t"
     90 
     91         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
     92         "mthi     $zero,                $ac1                            \n\t"
     93         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
     94         "mthi     $zero,                $ac3                            \n\t"
     95 
     96         "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
     97         "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
     98 
     99         "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
    100         "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
    101         "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
    102         "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
    103 
    104         "extp     %[step1_17],          $ac1,           31              \n\t"
    105         "extp     %[step1_30],          $ac3,           31              \n\t"
    106         "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"
    107         "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"
    108 
    109         : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
    110           [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
    111           [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
    112           [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
    113           [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
    114         : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
    115           [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
    116           [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
    117           [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
    118     );
    119 
    120     __asm__ __volatile__ (
    121         "lh       %[load1],             18(%[input])                    \n\t"
    122         "lh       %[load2],             46(%[input])                    \n\t"
    123         "lh       %[load3],             50(%[input])                    \n\t"
    124         "lh       %[load4],             14(%[input])                    \n\t"
    125 
    126         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    127         "mthi     $zero,                $ac1                            \n\t"
    128         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    129         "mthi     $zero,                $ac3                            \n\t"
    130 
    131         "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"
    132         "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"
    133         "extp     %[temp0],             $ac1,           31              \n\t"
    134 
    135         "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"
    136         "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"
    137         "extp     %[temp3],             $ac3,           31              \n\t"
    138 
    139         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    140         "mthi     $zero,                $ac1                            \n\t"
    141         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    142         "mthi     $zero,                $ac2                            \n\t"
    143 
    144         "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"
    145         "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"
    146         "extp     %[temp1],             $ac2,           31              \n\t"
    147 
    148         "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"
    149         "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"
    150         "extp     %[temp2],             $ac1,           31              \n\t"
    151 
    152         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    153         "mthi     $zero,                $ac1                            \n\t"
    154         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    155         "mthi     $zero,                $ac3                            \n\t"
    156 
    157         "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
    158         "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
    159 
    160         "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
    161         "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
    162         "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
    163         "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
    164 
    165         "extp     %[step1_18],          $ac1,           31              \n\t"
    166         "extp     %[step1_29],          $ac3,           31              \n\t"
    167         "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"
    168         "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"
    169 
    170         : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
    171           [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
    172           [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
    173           [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
    174           [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
    175         : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
    176           [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
    177           [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
    178           [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
    179     );
    180 
    181     __asm__ __volatile__ (
    182         "lh       %[load1],             10(%[input])                    \n\t"
    183         "lh       %[load2],             54(%[input])                    \n\t"
    184         "lh       %[load3],             42(%[input])                    \n\t"
    185         "lh       %[load4],             22(%[input])                    \n\t"
    186 
    187         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    188         "mthi     $zero,                $ac1                            \n\t"
    189         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    190         "mthi     $zero,                $ac3                            \n\t"
    191 
    192         "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"
    193         "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"
    194         "extp     %[temp0],             $ac1,           31              \n\t"
    195 
    196         "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"
    197         "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"
    198         "extp     %[temp3],             $ac3,           31              \n\t"
    199 
    200         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    201         "mthi     $zero,                $ac1                            \n\t"
    202         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    203         "mthi     $zero,                $ac2                            \n\t"
    204 
    205         "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"
    206         "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"
    207         "extp     %[temp1],             $ac2,           31              \n\t"
    208 
    209         "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"
    210         "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"
    211         "extp     %[temp2],             $ac1,           31              \n\t"
    212 
    213         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    214         "mthi     $zero,                $ac1                            \n\t"
    215         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    216         "mthi     $zero,                $ac3                            \n\t"
    217 
    218         "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
    219         "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
    220 
    221         "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"
    222         "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"
    223         "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"
    224         "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"
    225 
    226         "extp     %[step1_21],          $ac1,           31              \n\t"
    227         "extp     %[step1_26],          $ac3,           31              \n\t"
    228         "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"
    229         "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"
    230 
    231         : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
    232           [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
    233           [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
    234           [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
    235           [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
    236         : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
    237           [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
    238           [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
    239           [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
    240     );
    241 
    242     __asm__ __volatile__ (
    243         "lh       %[load1],             26(%[input])                    \n\t"
    244         "lh       %[load2],             38(%[input])                    \n\t"
    245         "lh       %[load3],             58(%[input])                    \n\t"
    246         "lh       %[load4],              6(%[input])                    \n\t"
    247 
    248         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    249         "mthi     $zero,                $ac1                            \n\t"
    250         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    251         "mthi     $zero,                $ac3                            \n\t"
    252 
    253         "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"
    254         "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"
    255         "extp     %[temp0],             $ac1,           31              \n\t"
    256         "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"
    257         "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"
    258         "extp     %[temp3],             $ac3,           31              \n\t"
    259 
    260         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    261         "mthi     $zero,                $ac1                            \n\t"
    262         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    263         "mthi     $zero,                $ac2                            \n\t"
    264 
    265         "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"
    266         "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"
    267         "extp     %[temp1],             $ac2,           31              \n\t"
    268         "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"
    269         "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"
    270         "extp     %[temp2],             $ac1,           31              \n\t"
    271 
    272         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    273         "mthi     $zero,                $ac1                            \n\t"
    274         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    275         "mthi     $zero,                $ac3                            \n\t"
    276 
    277         "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
    278         "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
    279         "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"
    280         "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"
    281         "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"
    282         "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"
    283         "extp     %[step1_22],          $ac1,           31              \n\t"
    284         "extp     %[step1_25],          $ac3,           31              \n\t"
    285         "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"
    286         "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"
    287 
    288         : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
    289           [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
    290           [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
    291           [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
    292           [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
    293         : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
    294           [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
    295           [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
    296           [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
    297     );
    298 
    299     __asm__ __volatile__ (
    300         "lh       %[load1],              4(%[input])                    \n\t"
    301         "lh       %[load2],             60(%[input])                    \n\t"
    302         "lh       %[load3],             36(%[input])                    \n\t"
    303         "lh       %[load4],             28(%[input])                    \n\t"
    304 
    305         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    306         "mthi     $zero,                $ac1                            \n\t"
    307         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    308         "mthi     $zero,                $ac3                            \n\t"
    309 
    310         "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"
    311         "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"
    312         "extp     %[temp0],             $ac1,           31              \n\t"
    313         "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"
    314         "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"
    315         "extp     %[temp3],             $ac3,           31              \n\t"
    316 
    317         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    318         "mthi     $zero,                $ac1                            \n\t"
    319         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    320         "mthi     $zero,                $ac2                            \n\t"
    321 
    322         "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"
    323         "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"
    324         "extp     %[temp1],             $ac2,           31              \n\t"
    325         "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"
    326         "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"
    327         "extp     %[temp2],             $ac1,           31              \n\t"
    328 
    329         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    330         "mthi     $zero,                $ac1                            \n\t"
    331         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    332         "mthi     $zero,                $ac3                            \n\t"
    333 
    334         "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
    335         "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
    336         "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"
    337         "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"
    338         "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"
    339         "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"
    340         "extp     %[step2_9],           $ac1,           31              \n\t"
    341         "extp     %[step2_14],          $ac3,           31              \n\t"
    342         "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"
    343         "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"
    344 
    345         : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
    346           [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
    347           [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
    348           [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
    349           [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
    350         : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
    351           [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
    352           [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
    353           [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
    354     );
    355 
    356     __asm__ __volatile__ (
    357         "lh       %[load1],             20(%[input])                    \n\t"
    358         "lh       %[load2],             44(%[input])                    \n\t"
    359         "lh       %[load3],             52(%[input])                    \n\t"
    360         "lh       %[load4],             12(%[input])                    \n\t"
    361 
    362         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    363         "mthi     $zero,                $ac1                            \n\t"
    364         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    365         "mthi     $zero,                $ac3                            \n\t"
    366 
    367         "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
    368         "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
    369         "extp     %[temp0],             $ac1,           31              \n\t"
    370         "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"
    371         "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"
    372         "extp     %[temp3],             $ac3,           31              \n\t"
    373 
    374         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    375         "mthi     $zero,                $ac1                            \n\t"
    376         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    377         "mthi     $zero,                $ac2                            \n\t"
    378 
    379         "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"
    380         "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"
    381         "extp     %[temp1],             $ac2,           31              \n\t"
    382         "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"
    383         "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"
    384         "extp     %[temp2],             $ac1,           31              \n\t"
    385 
    386         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    387         "mthi     $zero,                $ac1                            \n\t"
    388         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    389         "mthi     $zero,                $ac3                            \n\t"
    390 
    391         "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
    392         "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
    393         "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
    394         "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
    395         "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
    396         "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
    397         "extp     %[step2_10],          $ac1,           31              \n\t"
    398         "extp     %[step2_13],          $ac3,           31              \n\t"
    399         "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"
    400         "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"
    401 
    402         : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
    403           [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
    404           [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
    405           [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
    406           [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
    407         : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
    408           [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
    409           [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
    410           [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
    411     );
    412 
    413     __asm__ __volatile__ (
    414         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    415         "mthi     $zero,                $ac0                            \n\t"
    416         "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"
    417         "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"
    418         "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"
    419         "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
    420         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    421         "mthi     $zero,                $ac1                            \n\t"
    422         "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"
    423         "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"
    424         "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"
    425         "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
    426         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    427         "mthi     $zero,                $ac2                            \n\t"
    428         "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"
    429         "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"
    430         "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"
    431         "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"
    432         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    433         "mthi     $zero,                $ac3                            \n\t"
    434         "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"
    435         "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"
    436         "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"
    437         "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"
    438 
    439         "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"
    440         "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"
    441         "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"
    442         "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"
    443         "extp     %[step3_10],          $ac0,           31              \n\t"
    444         "extp     %[step3_13],          $ac1,           31              \n\t"
    445         "extp     %[step3_11],          $ac2,           31              \n\t"
    446         "extp     %[step3_12],          $ac3,           31              \n\t"
    447 
    448         : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
    449           [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
    450           [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
    451           [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
    452           [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
    453         : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8),
    454           [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
    455           [step2_11] "r" (step2_11), [step2_12] "r" (step2_12),
    456           [step2_13] "r" (step2_13), [step2_14] "r" (step2_14),
    457           [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64)
    458     );
    459 
    460     step2_18 = step1_17 - step1_18;
    461     step2_29 = step1_30 - step1_29;
    462 
    463     __asm__ __volatile__ (
    464         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    465         "mthi     $zero,                $ac0                            \n\t"
    466         "msub     $ac0,                 %[step2_18],    %[cospi_8_64]   \n\t"
    467         "madd     $ac0,                 %[step2_29],    %[cospi_24_64]  \n\t"
    468         "extp     %[step3_18],          $ac0,           31              \n\t"
    469 
    470         : [step3_18] "=r" (step3_18)
    471         : [const_2_power_13] "r" (const_2_power_13),
    472           [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
    473           [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    474     );
    475 
    476     temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
    477     step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
    478 
    479     step2_19 = step1_16 - step1_19;
    480     step2_28 = step1_31 - step1_28;
    481 
    482     __asm__ __volatile__ (
    483         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    484         "mthi     $zero,                $ac0                            \n\t"
    485         "msub     $ac0,                 %[step2_19],    %[cospi_8_64]   \n\t"
    486         "madd     $ac0,                 %[step2_28],    %[cospi_24_64]  \n\t"
    487         "extp     %[step3_19],          $ac0,           31              \n\t"
    488 
    489         : [step3_19] "=r" (step3_19)
    490         : [const_2_power_13] "r" (const_2_power_13),
    491           [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
    492           [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    493     );
    494 
    495     temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
    496     step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
    497 
    498     step3_16 = step1_16 + step1_19;
    499     step3_17 = step1_17 + step1_18;
    500     step3_30 = step1_29 + step1_30;
    501     step3_31 = step1_28 + step1_31;
    502 
    503     step2_20 = step1_23 - step1_20;
    504     step2_27 = step1_24 - step1_27;
    505 
    506     __asm__ __volatile__ (
    507         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    508         "mthi     $zero,                $ac0                            \n\t"
    509         "msub     $ac0,                 %[step2_20],    %[cospi_24_64]  \n\t"
    510         "msub     $ac0,                 %[step2_27],    %[cospi_8_64]   \n\t"
    511         "extp     %[step3_20],          $ac0,           31              \n\t"
    512 
    513         : [step3_20] "=r" (step3_20)
    514         : [const_2_power_13] "r" (const_2_power_13),
    515           [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
    516           [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    517     );
    518 
    519     temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
    520     step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
    521 
    522     step2_21 = step1_22 - step1_21;
    523     step2_26 = step1_25 - step1_26;
    524 
    525     __asm__ __volatile__ (
    526         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    527         "mthi     $zero,                $ac1                            \n\t"
    528         "msub     $ac1,                 %[step2_21],    %[cospi_24_64]  \n\t"
    529         "msub     $ac1,                 %[step2_26],    %[cospi_8_64]   \n\t"
    530         "extp     %[step3_21],          $ac1,           31              \n\t"
    531 
    532         : [step3_21] "=r" (step3_21)
    533         : [const_2_power_13] "r" (const_2_power_13),
    534           [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
    535           [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    536     );
    537 
    538     temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
    539     step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
    540 
    541     step3_22 = step1_21 + step1_22;
    542     step3_23 = step1_20 + step1_23;
    543     step3_24 = step1_24 + step1_27;
    544     step3_25 = step1_25 + step1_26;
    545 
    546     step2_16 = step3_16 + step3_23;
    547     step2_17 = step3_17 + step3_22;
    548     step2_18 = step3_18 + step3_21;
    549     step2_19 = step3_19 + step3_20;
    550     step2_20 = step3_19 - step3_20;
    551     step2_21 = step3_18 - step3_21;
    552     step2_22 = step3_17 - step3_22;
    553     step2_23 = step3_16 - step3_23;
    554 
    555     step2_24 = step3_31 - step3_24;
    556     step2_25 = step3_30 - step3_25;
    557     step2_26 = step3_29 - step3_26;
    558     step2_27 = step3_28 - step3_27;
    559     step2_28 = step3_28 + step3_27;
    560     step2_29 = step3_29 + step3_26;
    561     step2_30 = step3_30 + step3_25;
    562     step2_31 = step3_31 + step3_24;
    563 
    564     __asm__ __volatile__ (
    565         "lh       %[load1],             0(%[input])                     \n\t"
    566         "lh       %[load2],             32(%[input])                    \n\t"
    567         "lh       %[load3],             16(%[input])                    \n\t"
    568         "lh       %[load4],             48(%[input])                    \n\t"
    569 
    570         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    571         "mthi     $zero,                $ac1                            \n\t"
    572         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    573         "mthi     $zero,                $ac2                            \n\t"
    574         "add      %[result1],           %[load1],       %[load2]        \n\t"
    575         "sub      %[result2],           %[load1],       %[load2]        \n\t"
    576         "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
    577         "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
    578         "extp     %[temp0],             $ac1,           31              \n\t"
    579         "extp     %[temp1],             $ac2,           31              \n\t"
    580 
    581         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    582         "mthi     $zero,                $ac3                            \n\t"
    583         "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
    584         "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
    585         "extp     %[temp2],             $ac3,           31              \n\t"
    586         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    587         "mthi     $zero,                $ac1                            \n\t"
    588         "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
    589         "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
    590         "extp     %[temp3],             $ac1,           31              \n\t"
    591         "add      %[step1_0],           %[temp0],       %[temp3]        \n\t"
    592         "add      %[step1_1],           %[temp1],       %[temp2]        \n\t"
    593         "sub      %[step1_2],           %[temp1],       %[temp2]        \n\t"
    594         "sub      %[step1_3],           %[temp0],       %[temp3]        \n\t"
    595 
    596         : [load1] "=&r" (load1), [load2] "=&r" (load2),
    597           [load3] "=&r" (load3), [load4] "=&r" (load4),
    598           [result1] "=&r" (result1), [result2] "=&r" (result2),
    599           [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
    600           [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
    601           [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
    602           [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
    603         : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
    604           [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
    605           [cospi_16_64] "r" (cospi_16_64)
    606     );
    607 
    608     __asm__ __volatile__ (
    609         "lh       %[load1],             8(%[input])                     \n\t"
    610         "lh       %[load2],             56(%[input])                    \n\t"
    611         "lh       %[load3],             40(%[input])                    \n\t"
    612         "lh       %[load4],             24(%[input])                    \n\t"
    613 
    614         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    615         "mthi     $zero,                $ac1                            \n\t"
    616         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    617         "mthi     $zero,                $ac3                            \n\t"
    618 
    619         "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
    620         "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
    621         "extp     %[temp0],             $ac1,           31              \n\t"
    622         "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
    623         "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
    624         "extp     %[temp3],             $ac3,           31              \n\t"
    625 
    626         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    627         "mthi     $zero,                $ac1                            \n\t"
    628         "mtlo     %[const_2_power_13],  $ac2                            \n\t"
    629         "mthi     $zero,                $ac2                            \n\t"
    630 
    631         "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"
    632         "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"
    633         "extp     %[temp1],             $ac2,           31              \n\t"
    634         "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"
    635         "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"
    636         "extp     %[temp2],             $ac1,           31              \n\t"
    637 
    638         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
    639         "mthi     $zero,                $ac1                            \n\t"
    640         "mtlo     %[const_2_power_13],  $ac3                            \n\t"
    641         "mthi     $zero,                $ac3                            \n\t"
    642 
    643         "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
    644         "sub      %[load1],             %[load1],       %[temp0]        \n\t"
    645         "add      %[load1],             %[load1],       %[temp1]        \n\t"
    646         "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
    647         "sub      %[load2],             %[load2],       %[temp2]        \n\t"
    648         "add      %[load2],             %[load2],       %[temp3]        \n\t"
    649         "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"
    650         "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"
    651 
    652         "extp     %[step1_5],           $ac1,           31              \n\t"
    653         "extp     %[step1_6],           $ac3,           31              \n\t"
    654         "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"
    655         "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"
    656 
    657         : [load1] "=&r" (load1), [load2] "=&r" (load2),
    658           [load3] "=&r" (load3), [load4] "=&r" (load4),
    659           [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
    660           [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
    661           [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
    662           [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
    663         : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
    664           [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
    665           [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
    666           [cospi_16_64] "r" (cospi_16_64)
    667     );
    668 
    669     step2_0 = step1_0 + step1_7;
    670     step2_1 = step1_1 + step1_6;
    671     step2_2 = step1_2 + step1_5;
    672     step2_3 = step1_3 + step1_4;
    673     step2_4 = step1_3 - step1_4;
    674     step2_5 = step1_2 - step1_5;
    675     step2_6 = step1_1 - step1_6;
    676     step2_7 = step1_0 - step1_7;
    677 
    678     // stage 7
    679     step1_0 = step2_0 + step3_15;
    680     step1_1 = step2_1 + step3_14;
    681     step1_2 = step2_2 + step3_13;
    682     step1_3 = step2_3 + step3_12;
    683     step1_4 = step2_4 + step3_11;
    684     step1_5 = step2_5 + step3_10;
    685     step1_6 = step2_6 + step3_9;
    686     step1_7 = step2_7 + step3_8;
    687     step1_8 = step2_7 - step3_8;
    688     step1_9 = step2_6 - step3_9;
    689     step1_10 = step2_5 - step3_10;
    690     step1_11 = step2_4 - step3_11;
    691     step1_12 = step2_3 - step3_12;
    692     step1_13 = step2_2 - step3_13;
    693     step1_14 = step2_1 - step3_14;
    694     step1_15 = step2_0 - step3_15;
    695 
    696     __asm__ __volatile__ (
    697         "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"
    698         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    699         "mthi     $zero,                $ac0                            \n\t"
    700         "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
    701         "extp     %[step1_20],          $ac0,           31              \n\t"
    702 
    703         : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
    704         : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20),
    705           [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64)
    706     );
    707 
    708     temp21 = (step2_20 + step2_27) * cospi_16_64;
    709     step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
    710 
    711     __asm__ __volatile__ (
    712         "sub      %[temp0],             %[step2_26],    %[step2_21]     \n\t"
    713         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    714         "mthi     $zero,                $ac0                            \n\t"
    715         "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
    716         "extp     %[step1_21],          $ac0,           31              \n\t"
    717 
    718         : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
    719         : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26),
    720           [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64)
    721     );
    722 
    723     temp21 = (step2_21 + step2_26) * cospi_16_64;
    724     step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
    725 
    726     __asm__ __volatile__ (
    727         "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"
    728         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    729         "mthi     $zero,                $ac0                            \n\t"
    730         "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
    731         "extp     %[step1_22],          $ac0,           31              \n\t"
    732 
    733         : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
    734         : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25),
    735           [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64)
    736     );
    737 
    738     temp21 = (step2_22 + step2_25) * cospi_16_64;
    739     step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
    740 
    741     __asm__ __volatile__ (
    742         "sub      %[temp0],             %[step2_24],    %[step2_23]     \n\t"
    743         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
    744         "mthi     $zero,                $ac0                            \n\t"
    745         "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
    746         "extp     %[step1_23],          $ac0,           31              \n\t"
    747 
    748         : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
    749         : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24),
    750           [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64)
    751     );
    752 
    753     temp21 = (step2_23 + step2_24) * cospi_16_64;
    754     step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
    755 
    756     __asm__ __volatile__ (
    757         "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
    758         "add      %[temp0],         %[step1_0],         %[step2_31]     \n\t"
    759         "addi     %[temp0],         %[temp0],           32              \n\t"
    760         "sra      %[temp0],         %[temp0],           6               \n\t"
    761         "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
    762         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
    763         "add      %[temp1],         %[step1_1],         %[step2_30]     \n\t"
    764         "sb       %[temp0],         0(%[dest_pix])                      \n\t"
    765         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
    766         "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
    767         "addi     %[temp1],         %[temp1],           32              \n\t"
    768         "sra      %[temp1],         %[temp1],           6               \n\t"
    769         "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
    770         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
    771         "sb       %[temp1],         0(%[dest_pix])                      \n\t"
    772         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
    773 
    774         "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
    775         "add      %[temp0],         %[step1_2],         %[step2_29]     \n\t"
    776         "addi     %[temp0],         %[temp0],           32              \n\t"
    777         "sra      %[temp0],         %[temp0],           6               \n\t"
    778         "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
    779         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
    780         "add      %[temp1],         %[step1_3],         %[step2_28]     \n\t"
    781         "sb       %[temp0],         0(%[dest_pix])                      \n\t"
    782         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
    783         "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
    784         "addi     %[temp1],         %[temp1],           32              \n\t"
    785         "sra      %[temp1],         %[temp1],           6               \n\t"
    786         "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
    787         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
    788         "sb       %[temp1],         0(%[dest_pix])                      \n\t"
    789         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
    790 
    791         : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
    792           [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
    793         : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
    794           [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
    795           [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
    796           [step2_28] "r" (step2_28), [step2_29] "r" (step2_29),
    797           [step2_30] "r" (step2_30), [step2_31] "r" (step2_31)
    798     );
    799 
    800     step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
    801     step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
    802     step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
    803     step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
    804 
    805     __asm__ __volatile__ (
    806         "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
    807         "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
    808         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
    809         "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
    810         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
    811         "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
    812         "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
    813         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
    814         "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
    815         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
    816 
    817         "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
    818         "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
    819         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
    820         "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
    821         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
    822         "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
    823         "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
    824         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
    825         "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
    826         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
    827 
    828         : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
    829           [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
    830         : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
    831           [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
    832           [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
    833     );
    834 
    835     __asm__ __volatile__ (
    836         "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
    837         "add      %[temp0],         %[step1_4],         %[step1_27]     \n\t"
    838         "addi     %[temp0],         %[temp0],           32              \n\t"
    839         "sra      %[temp0],         %[temp0],           6               \n\t"
    840         "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
    841         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
    842         "add      %[temp1],         %[step1_5],         %[step1_26]     \n\t"
    843         "sb       %[temp0],         0(%[dest_pix])                      \n\t"
    844         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
    845         "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
    846         "addi     %[temp1],         %[temp1],           32              \n\t"
    847         "sra      %[temp1],         %[temp1],           6               \n\t"
    848         "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
    849         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
    850         "sb       %[temp1],         0(%[dest_pix])                      \n\t"
    851         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
    852 
    853         "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
    854         "add      %[temp0],         %[step1_6],         %[step1_25]     \n\t"
    855         "addi     %[temp0],         %[temp0],           32              \n\t"
    856         "sra      %[temp0],         %[temp0],           6               \n\t"
    857         "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
    858         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
    859         "add      %[temp1],         %[step1_7],         %[step1_24]     \n\t"
    860         "sb       %[temp0],         0(%[dest_pix])                      \n\t"
    861         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
    862         "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
    863         "addi     %[temp1],         %[temp1],           32              \n\t"
    864         "sra      %[temp1],         %[temp1],           6               \n\t"
    865         "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
    866         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
    867         "sb       %[temp1],         0(%[dest_pix])                      \n\t"
    868         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
    869 
    870         : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
    871           [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
    872         : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
    873           [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
    874           [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
    875           [step1_24] "r" (step1_24), [step1_25] "r" (step1_25),
    876           [step1_26] "r" (step1_26), [step1_27] "r" (step1_27)
    877     );
    878 
    879     step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
    880     step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
    881     step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
    882     step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
    883 
    884     __asm__ __volatile__ (
    885         "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
    886         "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
    887         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
    888         "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
    889         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
    890         "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
    891         "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
    892         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
    893         "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
    894         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
    895 
    896         "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
    897         "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
    898         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
    899         "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
    900         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
    901         "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
    902         "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
    903         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
    904         "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
    905         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
    906 
    907         : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
    908           [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
    909         : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
    910           [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
    911           [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
    912     );
    913 
    914     __asm__ __volatile__ (
    915         "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
    916         "add      %[temp0],         %[step1_8],         %[step1_23]     \n\t"
    917         "addi     %[temp0],         %[temp0],           32              \n\t"
    918         "sra      %[temp0],         %[temp0],           6               \n\t"
    919         "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
    920         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
    921         "add      %[temp1],         %[step1_9],         %[step1_22]     \n\t"
    922         "sb       %[temp0],         0(%[dest_pix])                      \n\t"
    923         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
    924         "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
    925         "addi     %[temp1],         %[temp1],           32              \n\t"
    926         "sra      %[temp1],         %[temp1],           6               \n\t"
    927         "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
    928         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
    929         "sb       %[temp1],         0(%[dest_pix])                      \n\t"
    930         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
    931 
    932         "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
    933         "add      %[temp0],         %[step1_10],        %[step1_21]     \n\t"
    934         "addi     %[temp0],         %[temp0],           32              \n\t"
    935         "sra      %[temp0],         %[temp0],           6               \n\t"
    936         "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
    937         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
    938         "add      %[temp1],         %[step1_11],        %[step1_20]     \n\t"
    939         "sb       %[temp0],         0(%[dest_pix])                      \n\t"
    940         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
    941         "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
    942         "addi     %[temp1],         %[temp1],           32              \n\t"
    943         "sra      %[temp1],         %[temp1],           6               \n\t"
    944         "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
    945         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
    946         "sb       %[temp1],         0(%[dest_pix])                      \n\t"
    947         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
    948 
    949         : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
    950           [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
    951         : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
    952           [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
    953           [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
    954           [step1_20] "r" (step1_20), [step1_21] "r" (step1_21),
    955           [step1_22] "r" (step1_22), [step1_23] "r" (step1_23)
    956     );
    957 
    958     step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
    959     step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
    960     step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
    961     step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
    962 
    963     __asm__ __volatile__ (
    964         "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
    965         "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
    966         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
    967         "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
    968         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
    969         "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
    970         "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
    971         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
    972         "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
    973         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
    974 
    975         "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
    976         "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
    977         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
    978         "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
    979         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
    980         "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
    981         "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
    982         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
    983         "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
    984         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
    985 
    986         : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
    987           [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
    988         : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
    989           [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
    990           [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
    991     );
    992 
    993     __asm__ __volatile__ (
    994         "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
    995         "add      %[temp0],         %[step1_12],        %[step2_19]     \n\t"
    996         "addi     %[temp0],         %[temp0],           32              \n\t"
    997         "sra      %[temp0],         %[temp0],           6               \n\t"
    998         "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
    999         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
   1000         "add      %[temp1],         %[step1_13],        %[step2_18]     \n\t"
   1001         "sb       %[temp0],         0(%[dest_pix])                      \n\t"
   1002         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
   1003         "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
   1004         "addi     %[temp1],         %[temp1],           32              \n\t"
   1005         "sra      %[temp1],         %[temp1],           6               \n\t"
   1006         "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
   1007         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
   1008         "sb       %[temp1],         0(%[dest_pix])                      \n\t"
   1009         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
   1010 
   1011         "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
   1012         "add      %[temp0],         %[step1_14],        %[step2_17]     \n\t"
   1013         "addi     %[temp0],         %[temp0],           32              \n\t"
   1014         "sra      %[temp0],         %[temp0],           6               \n\t"
   1015         "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
   1016         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
   1017         "add      %[temp1],         %[step1_15],        %[step2_16]     \n\t"
   1018         "sb       %[temp0],         0(%[dest_pix])                      \n\t"
   1019         "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
   1020         "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
   1021         "addi     %[temp1],         %[temp1],           32              \n\t"
   1022         "sra      %[temp1],         %[temp1],           6               \n\t"
   1023         "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
   1024         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
   1025         "sb       %[temp1],         0(%[dest_pix])                      \n\t"
   1026 
   1027         : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
   1028           [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
   1029         : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
   1030           [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
   1031           [step1_14] "r" (step1_14), [step1_15] "r" (step1_15),
   1032           [step2_16] "r" (step2_16), [step2_17] "r" (step2_17),
   1033           [step2_18] "r" (step2_18), [step2_19] "r" (step2_19)
   1034     );
   1035 
   1036     step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
   1037     step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
   1038     step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
   1039     step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
   1040 
   1041     __asm__ __volatile__ (
   1042         "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
   1043         "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
   1044         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
   1045         "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
   1046         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
   1047         "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
   1048         "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
   1049         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
   1050         "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
   1051         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
   1052 
   1053         "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
   1054         "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
   1055         "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
   1056         "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
   1057         "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
   1058         "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
   1059         "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
   1060         "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
   1061         "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
   1062 
   1063         : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
   1064           [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
   1065         : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
   1066           [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
   1067           [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
   1068     );
   1069 
   1070     input += 32;
   1071   }
   1072 }
   1073 #endif  // #if HAVE_DSPR2
   1074