/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
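/* Row pass of the 8x8 inverse DCT: applies the 1-D 8-point IDCT to no_rows
 * rows of input and writes each row's results with a stride of eight
 * int16_t (i.e. transposed), so the column pass can read a whole column as
 * one contiguous block.  The accumulators are preloaded with 1 << 13 and
 * read back with extp, implementing dct_const_round_shift(); the extract
 * position (45) is programmed by the callers via wrdsp. */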
static void idct8_rows_dspr2(const int16_t *input, int16_t *output,
                             uint32_t no_rows) {
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  const int const_2_power_13 = 8192;
  int Temp0, Temp1, Temp2, Temp3, Temp4;
  int i;

  for (i = no_rows; i--; ) {
    __asm__ __volatile__ (
        /*
          temp_1 = (input[0] + input[4]) * cospi_16_64;
          step2_0 = dct_const_round_shift(temp_1);

          temp_2 = (input[0] - input[4]) * cospi_16_64;
          step2_1 = dct_const_round_shift(temp_2);
        */
        "lh       %[Temp0],             0(%[input])                     \n\t"
        "lh       %[Temp1],             8(%[input])                     \n\t"
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
        "extp     %[Temp4],             $ac0,           31              \n\t"

        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "extp     %[Temp2],             $ac1,           31              \n\t"

        /*
          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
          step2_2 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0],             4(%[input])                     \n\t"
        "lh       %[Temp1],             12(%[input])                    \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "extp     %[Temp3],             $ac0,           31              \n\t"

        /*
          step1_1 = step2_1 + step2_2;
          step1_2 = step2_1 - step2_2;
        */
        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"

        /*
          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
          step2_3 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
        "extp     %[Temp1],             $ac1,           31              \n\t"

        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"

        /*
          step1_0 = step2_0 + step2_3;
          step1_3 = step2_0 - step2_3;
        */
        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"

        /*
          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
          step1_4 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0],             2(%[input])                     \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "lh       %[Temp1],             14(%[input])                    \n\t"
        "lh       %[Temp0],             2(%[input])                     \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
        "extp     %[step1_4],           $ac0,           31              \n\t"

        /*
          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
          step1_7 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
        "extp     %[step1_7],           $ac1,           31              \n\t"

        /*
          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
          step1_5 = dct_const_round_shift(temp_1);
        */
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "lh       %[Temp0],             10(%[input])                    \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
        "lh       %[Temp1],             6(%[input])                     \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
        "extp     %[step1_5],           $ac0,           31              \n\t"

        /*
          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
          step1_6 = dct_const_round_shift(temp_2);
        */
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "lh       %[Temp0],             10(%[input])                    \n\t"
        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
        "lh       %[Temp1],             6(%[input])                     \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
        "extp     %[step1_6],           $ac1,           31              \n\t"

        /*
          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
        */
        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"

        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"

        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"

        /*
          step1_4 = step1_4 + step1_5;
          step1_7 = step1_6 + step1_7;
        */
        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"

        "extp     %[step1_5],           $ac0,           31              \n\t"
        "extp     %[step1_6],           $ac1,           31              \n\t"

        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
        "sh       %[Temp0],             0(%[output])                    \n\t"
        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
        "sh       %[Temp1],             16(%[output])                   \n\t"
        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
        "sh       %[Temp0],             32(%[output])                   \n\t"
        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
        "sh       %[Temp1],             48(%[output])                   \n\t"

        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
        "sh       %[Temp0],             64(%[output])                   \n\t"
        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
        "sh       %[Temp1],             80(%[output])                   \n\t"
        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
        "sh       %[Temp0],             96(%[output])                   \n\t"
        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
        "sh       %[Temp1],             112(%[output])                  \n\t"

        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [Temp4] "=&r" (Temp4)
        : [const_2_power_13] "r" (const_2_power_13),
          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
          [cospi_24_64] "r" (cospi_24_64),
          [output] "r" (output), [input] "r" (input)
    );

    input += 8;
    output += 1;
  }
}

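/* Column pass of the 8x8 inverse DCT: runs the same 1-D IDCT on each column
 * of the transposed intermediate produced by idct8_rows_dspr2, rounds with
 * ROUND_POWER_OF_TWO(x, 5), adds the result to the destination pixels and
 * clamps through the vp9_ff_cropTbl lookup (cm), advancing dest_stride
 * bytes after every output sample. */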
static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                        int dest_stride) {
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  int Temp0, Temp1, Temp2, Temp3;
  int i;
  const int const_2_power_13 = 8192;
  uint8_t *dest_pix;
  uint8_t *cm = vp9_ff_cropTbl;

  /* prefetch vp9_ff_cropTbl */
  vp9_prefetch_load(vp9_ff_cropTbl);
  vp9_prefetch_load(vp9_ff_cropTbl +  32);
  vp9_prefetch_load(vp9_ff_cropTbl +  64);
  vp9_prefetch_load(vp9_ff_cropTbl +  96);
  vp9_prefetch_load(vp9_ff_cropTbl + 128);
  vp9_prefetch_load(vp9_ff_cropTbl + 160);
  vp9_prefetch_load(vp9_ff_cropTbl + 192);
  vp9_prefetch_load(vp9_ff_cropTbl + 224);

  for (i = 0; i < 8; ++i) {
    dest_pix = (dest + i);

    __asm__ __volatile__ (
        /*
          temp_1 = (input[0] + input[4]) * cospi_16_64;
          step2_0 = dct_const_round_shift(temp_1);

          temp_2 = (input[0] - input[4]) * cospi_16_64;
          step2_1 = dct_const_round_shift(temp_2);
        */
        "lh       %[Temp0],             0(%[input])                     \n\t"
        "lh       %[Temp1],             8(%[input])                     \n\t"
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
        "extp     %[step1_6],           $ac0,           31              \n\t"

        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "extp     %[Temp2],             $ac1,           31              \n\t"

        /*
          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
          step2_2 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0],             4(%[input])                     \n\t"
        "lh       %[Temp1],             12(%[input])                    \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "extp     %[Temp3],             $ac0,           31              \n\t"

        /*
          step1_1 = step2_1 + step2_2;
          step1_2 = step2_1 - step2_2;
        */
        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"

        /*
          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
          step2_3 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
        "extp     %[Temp1],             $ac1,           31              \n\t"

        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"

        /*
          step1_0 = step2_0 + step2_3;
          step1_3 = step2_0 - step2_3;
        */
        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"

        /*
          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
          step1_4 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0],             2(%[input])                     \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "lh       %[Temp1],             14(%[input])                    \n\t"
        "lh       %[Temp0],             2(%[input])                     \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
        "extp     %[step1_4],           $ac0,           31              \n\t"

        /*
          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
          step1_7 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
        "extp     %[step1_7],           $ac1,           31              \n\t"

        /*
          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
          step1_5 = dct_const_round_shift(temp_1);
        */
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "lh       %[Temp0],             10(%[input])                    \n\t"
        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
        "lh       %[Temp1],             6(%[input])                     \n\t"
        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
        "extp     %[step1_5],           $ac0,           31              \n\t"

        /*
          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
          step1_6 = dct_const_round_shift(temp_2);
        */
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"
        "lh       %[Temp0],             10(%[input])                    \n\t"
        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
        "lh       %[Temp1],             6(%[input])                     \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
        "extp     %[step1_6],           $ac1,           31              \n\t"

        /*
          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
        */
        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"

        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
        "mthi     $zero,                $ac0                            \n\t"
        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
        "mthi     $zero,                $ac1                            \n\t"

        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"

        /*
          step1_4 = step1_4 + step1_5;
          step1_7 = step1_6 + step1_7;
        */
        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"

        "extp     %[step1_5],           $ac0,           31              \n\t"
        "extp     %[step1_6],           $ac1,           31              \n\t"

        /* add block */
        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"

        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
        "addi     %[Temp0],             %[Temp0],       16              \n\t"
        "sra      %[Temp0],             %[Temp0],       5               \n\t"
        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [dest_pix] "+r" (dest_pix)
        : [const_2_power_13] "r" (const_2_power_13),
          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
          [cospi_24_64] "r" (cospi_24_64),
          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
    );

    input += 8;
  }
}

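/* Full 8x8 inverse DCT with reconstruction: set the DSPr2 extract position,
 * run the row pass over all eight rows, then the column pass, which adds
 * the result to dest. */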
void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows
  idct8_rows_dspr2(input, outptr, 8);

  // Then transform columns and add to dest
  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
}

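/* Plain C 8-point inverse ADST (same butterfly structure as the generic
 * VP9 iadst8), used for the hybrid transform types below.  The early-out
 * zeroes the output when every input coefficient is zero. */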
static void iadst8_dspr2(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  int x0, x1, x2, x3, x4, x5, x6, x7;

  x0 = input[7];
  x1 = input[0];
  x2 = input[5];
  x3 = input[2];
  x4 = input[3];
  x5 = input[4];
  x6 = input[1];
  x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);

  output[0] =  x0;
  output[1] = -x4;
  output[2] =  x6;
  output[3] = -x2;
  output[4] =  x3;
  output[5] = -x7;
  output[6] =  x5;
  output[7] = -x1;
}

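/* 8x8 inverse hybrid transform: selects DCT or ADST per direction from
 * tx_type.  The DSPr2 IDCT helpers are reused for the DCT direction(s);
 * ADST directions go through iadst8_dspr2() with clip_pixel()
 * reconstruction in C. */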
void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride, int tx_type) {
  int i, j;
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  int16_t temp_in[8 * 8], temp_out[8];
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );

  switch (tx_type) {
    case DCT_DCT:     // DCT in both horizontal and vertical
      idct8_rows_dspr2(input, outptr, 8);
      idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
      break;
    case ADST_DCT:    // ADST in vertical, DCT in horizontal
      idct8_rows_dspr2(input, outptr, 8);

      for (i = 0; i < 8; ++i) {
        iadst8_dspr2(&out[i * 8], temp_out);

        for (j = 0; j < 8; ++j)
          dest[j * dest_stride + i] =
              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) +
                         dest[j * dest_stride + i]);
      }
      break;
    case DCT_ADST:    // DCT in vertical, ADST in horizontal
      for (i = 0; i < 8; ++i) {
        iadst8_dspr2(input, outptr);
        input += 8;
        outptr += 8;
      }

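      /* Transpose the ADST row results into temp_in: the column helper
       * expects its input stored transposed, as idct8_rows_dspr2 would
       * have produced it, so each column is read contiguously. */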
      for (i = 0; i < 8; ++i) {
        for (j = 0; j < 8; ++j) {
          temp_in[i * 8 + j] = out[j * 8 + i];
        }
      }
      idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
      break;
    case ADST_ADST:   // ADST in both directions
      for (i = 0; i < 8; ++i) {
        iadst8_dspr2(input, outptr);
        input += 8;
        outptr += 8;
      }

      for (i = 0; i < 8; ++i) {
        for (j = 0; j < 8; ++j)
          temp_in[j] = out[j * 8 + i];

        iadst8_dspr2(temp_in, temp_out);

        for (j = 0; j < 8; ++j)
          dest[j * dest_stride + i] =
              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) +
                         dest[j * dest_stride + i]);
      }
      break;
    default:
      printf("vp9_iht8x8_64_add_dspr2 : Invalid tx_type\n");
      break;
  }
}

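/* 8x8 inverse DCT for blocks where only low-frequency coefficients are
 * present: the row pass is run on the first four rows only, the untouched
 * half of the transposed intermediate is cleared, and the full column pass
 * then reconstructs into dest. */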
void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows
  idct8_rows_dspr2(input, outptr, 4);

  outptr += 4;

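  /* The 4-row pass above never wrote entries 4..7 of each group of eight
   * in the transposed intermediate; clear them (two words per group)
   * before the column pass reads them. */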
  __asm__ __volatile__ (
      "sw  $zero,   0(%[outptr])  \n\t"
      "sw  $zero,   4(%[outptr])  \n\t"
      "sw  $zero,  16(%[outptr])  \n\t"
      "sw  $zero,  20(%[outptr])  \n\t"
      "sw  $zero,  32(%[outptr])  \n\t"
      "sw  $zero,  36(%[outptr])  \n\t"
      "sw  $zero,  48(%[outptr])  \n\t"
      "sw  $zero,  52(%[outptr])  \n\t"
      "sw  $zero,  64(%[outptr])  \n\t"
      "sw  $zero,  68(%[outptr])  \n\t"
      "sw  $zero,  80(%[outptr])  \n\t"
      "sw  $zero,  84(%[outptr])  \n\t"
      "sw  $zero,  96(%[outptr])  \n\t"
      "sw  $zero, 100(%[outptr])  \n\t"
      "sw  $zero, 112(%[outptr])  \n\t"
      "sw  $zero, 116(%[outptr])  \n\t"

      :
      : [outptr] "r" (outptr)
  );

  // Then transform columns and add to dest
  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
}

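/* DC-only 8x8 inverse DCT: input[0] is scaled by cospi_16_64 twice with
 * rounding, rounded once more by ROUND_POWER_OF_TWO(x, 5), replicated into
 * all four bytes of a word with replv.qb, and then added to (or, for a
 * negative DC, subtracted from) every pixel of the block with saturating
 * quad-byte operations. */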
void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t t1, t2, vector_a1, vector_1, vector_2;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"

    :
    : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  __asm__ __volatile__ (
      "addi     %[out],     %[out],     16      \n\t"
      "sra      %[a1],      %[out],     5       \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

  if (a1 < 0) {
    /* use quad-byte operations;
     * input and output memory are four-byte aligned */
    __asm__ __volatile__ (
        "abs        %[absa1],       %[a1]       \n\t"
        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw           %[t1],          0(%[dest])                      \n\t"
          "lw           %[t2],          4(%[dest])                      \n\t"
          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "sw           %[vector_1],    0(%[dest])                      \n\t"
          "sw           %[vector_2],    4(%[dest])                      \n\t"
          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  } else {
    /* use quad-byte operations;
     * input and output memory are four-byte aligned */
    __asm__ __volatile__ (
        "replv.qb   %[vector_a1],   %[a1]   \n\t"

        : [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw           %[t1],          0(%[dest])                      \n\t"
          "lw           %[t2],          4(%[dest])                      \n\t"
          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "sw           %[vector_1],    0(%[dest])                      \n\t"
          "sw           %[vector_2],    4(%[dest])                      \n\t"
          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  }
}
#endif  // #if HAVE_DSPR2