Home | History | Annotate | Download | only in mips
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #ifndef VPX_DSP_MIPS_INV_TXFM_MSA_H_
     12 #define VPX_DSP_MIPS_INV_TXFM_MSA_H_
     13 
     14 #include "vpx_dsp/mips/macros_msa.h"
     15 #include "vpx_dsp/mips/txfm_macros_msa.h"
     16 #include "vpx_dsp/txfm_common.h"
     17 
        /* VP9_ADST8(in0..in7, out0..out7)
         *
         * Eight-input, eight-output transform on MSA vectors, driven by the
         * cospi_*_64 constants held in coeff0_m / coeff1_m. The name suggests one
         * 8-point ADST pass (NOTE(review): confirm against the scalar reference,
         * which is not visible from this header).
         *
         * - in0..in7 are all overwritten with intermediate values, so the inputs
         *   are not preserved and every argument must be a modifiable variable.
         * - out0..out7 are all assigned; out1, out3, out5 and out7 receive negated
         *   intermediates.
         * - Expands to a bare { ... } block that declares its own *_m temporaries.
         * - The helper macros used here (SPLATI_*, ILVEV_*, ILVRL_*,
         *   DOT_ADD_SUB_SRARI_PCK, DOT_SHIFT_RIGHT_PCK_H, BUTTERFLY_4) are not
         *   defined in this file; presumably they come from the included
         *   macros_msa.h / txfm_macros_msa.h.
         */
     18 #define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,  \
     19                   out3, out4, out5, out6, out7)                              \
     20   {                                                                          \
     21     v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                       \
     22     v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                        \
     23     v8i16 coeff0_m = { cospi_2_64,  cospi_6_64,  cospi_10_64, cospi_14_64,   \
     24                        cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
     25     v8i16 coeff1_m = { cospi_8_64,  -cospi_8_64,  cospi_16_64, -cospi_16_64, \
     26                        cospi_24_64, -cospi_24_64, 0,           0 };          \
     27     /* Constant pairs for the (in0, in7) and (in4, in3) input pairs. */      \
     28     SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                          \
     29     cnst2_m = -cnst0_m;                                                      \
     30     ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);       \
     31     SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                          \
     32     cnst4_m = -cnst2_m;                                                      \
     33     ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);       \
     34                                                                              \
     35     ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                   \
     36     ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                   \
     37     DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m,  \
     38                           cnst2_m, cnst3_m, in7, in0, in4, in3);             \
     39     /* Same again for the (in2, in5) and (in6, in1) input pairs. */          \
     40     SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                          \
     41     cnst2_m = -cnst0_m;                                                      \
     42     ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);       \
     43     SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                          \
     44     cnst4_m = -cnst2_m;                                                      \
     45     ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);       \
     46                                                                              \
     47     ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                   \
     48     ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                   \
     49                                                                              \
     50     DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m,  \
     51                           cnst2_m, cnst3_m, in5, in2, in6, in1);             \
     52     BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                   \
     53     out7 = -s0_m;                                                            \
     54     out0 = s1_m;                                                             \
     55                                                                              \
     56     SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m);  \
     57                                                                              \
     58     ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);       \
     59     cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                               \
     60     cnst1_m = cnst0_m;                                                       \
     61                                                                              \
     62     ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                   \
     63     ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                   \
     64     DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m,  \
     65                           cnst3_m, cnst1_m, out1, out6, s0_m, s1_m);         \
     66                                                                              \
     67     SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                          \
     68     cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                               \
     69                                                                              \
     70     ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                   \
     71     ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                                 \
     72     out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                   \
     73     out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                   \
     74     out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);                   \
     75     out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);                   \
     76     /* Negate out1, out3 and out5 (out7 was negated above). */               \
     77     out1 = -out1;                                                            \
     78     out3 = -out3;                                                            \
     79     out5 = -out5;                                                            \
     80   }
     81 
        /* VP9_SET_COSPI_PAIR(c0_h, c1_h)
         *
         * GNU statement expression that evaluates to a v8i16 constant vector built
         * from two 16-bit values: each value is broadcast to every lane with
         * __msa_fill_h, then the two vectors are merged with __msa_ilvev_h. Per
         * the MSA definition of ilvev.h the result alternates the two values
         * (even lanes c0_h, odd lanes c1_h), which is the layout a pairwise dot
         * product against interleaved inputs needs.
         *
         * Each argument is expanded exactly once. The temporaries out0_m, r0_m
         * and r1_m are scoped to the expression.
         */
     82 #define VP9_SET_COSPI_PAIR(c0_h, c1_h)  \
     83   ({                                    \
     84     v8i16 out0_m, r0_m, r1_m;           \
     85                                         \
     86     r0_m = __msa_fill_h(c0_h);          \
     87     r1_m = __msa_fill_h(c1_h);          \
     88     out0_m = __msa_ilvev_h(r1_m, r0_m); \
     89                                         \
     90     out0_m;                             \
     91   })
     92 
        /* VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)
         *
         * Adds the four 16-bit vectors in0..in3 to four rows read from dst and
         * stores the result back to the same location.
         *
         * Sequence: LD_UB4 loads four rows spaced dst_stride apart; ILVR_B4_SH
         * interleaves each row with zero_m (presumably widening the pixels to
         * 16 bits - the helper is defined elsewhere); ADD4 adds in0..in3;
         * CLIP_SH4_0_255 clamps; PCKEV_B2_SB packs back to bytes; ST8x4_UB
         * writes the block.
         *
         * dst is captured once in dst_m. dst_stride is expanded in both the load
         * and the store, so it should be free of side effects. in0..in3 appear
         * only as ADD4 operands.
         */
     93 #define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)               \
     94   {                                                                            \
     95     uint8_t *dst_m = (uint8_t *)(dst);                                         \
     96     v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                                      \
     97     v16i8 tmp0_m, tmp1_m;                                                      \
     98     v16i8 zero_m = { 0 };                                                      \
     99     v8i16 res0_m, res1_m, res2_m, res3_m;                                      \
    100                                                                                \
    101     LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m);                 \
    102     ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \
    103                res0_m, res1_m, res2_m, res3_m);                                \
    104     ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m,   \
    105          res2_m, res3_m);                                                      \
    106     CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m);                            \
    107     PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m);               \
    108     ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride);                               \
    109   }
    110 
        /* VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3)
         *
         * One pass of a 4-point transform (inverse DCT, going by the name) over
         * the vectors in0..in3, producing out0..out3.
         *
         * in0/in2 and in1/in3 are interleaved with __msa_ilvr_h, so per the MSA
         * definition of ilvr only the right (low) half of each input vector
         * contributes. The interleaved pairs are multiplied against constant
         * pairs built by VP9_SET_COSPI_PAIR, rounded with SRARI_W4_SW by
         * DCT_CONST_BITS, packed back to halfwords and combined by BUTTERFLY_4.
         *
         * The inputs are only read. All *_m temporaries are local to the block.
         */
    111 #define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3)             \
    112   {                                                                         \
    113     v8i16 c0_m, c1_m, c2_m, c3_m;                                           \
    114     v8i16 step0_m, step1_m;                                                 \
    115     v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
    116     /* in0/in2 pair, weighted by +/-cospi_16_64. */                         \
    117     c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);                    \
    118     c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);                   \
    119     step0_m = __msa_ilvr_h(in2, in0);                                       \
    120     DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m);              \
    121     /* in1/in3 pair, weighted by cospi_24_64 and cospi_8_64. */             \
    122     c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                    \
    123     c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                     \
    124     step1_m = __msa_ilvr_h(in3, in1);                                       \
    125     DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m);              \
    126     SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);            \
    127     /* Narrow to halfwords, then combine with a butterfly. */               \
    128     PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m);            \
    129     SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8);                        \
    130     BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \
    131                 out0, out1, out2, out3);                                    \
    132   }
    133 
        /* VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3)
         *
         * One pass of a 4-point transform (inverse ADST, going by the name) that
         * uses the sinpi_1_9 .. sinpi_4_9 constants and their negations stored in
         * mask_m.
         *
         * Four 32-bit accumulators int0_m..int3_m are formed as sums of
         * halfword dot products between interleaved inputs and interleaved
         * constant pairs. They are then rounded by DCT_CONST_BITS and packed to
         * halfwords. Each output packs one accumulator with itself, so
         * (presumably) both halves of every output hold the same four results -
         * TODO confirm against the PCKEV_H2_SH definition.
         *
         * Only __msa_ilvr_h / ILVR_H2_SH are applied to the inputs, so per the
         * MSA definition of ilvr only the right (low) half of each input
         * contributes. The inputs are only read.
         */
    134 #define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3)       \
    135   {                                                                    \
    136     v8i16 res0_m, res1_m, c0_m, c1_m;                                  \
    137     v8i16 k1_m, k2_m, k3_m, k4_m;                                      \
    138     v8i16 zero_m = { 0 };                                              \
    139     v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
    140     v4i32 int0_m, int1_m, int2_m, int3_m;                              \
    141     v8i16 mask_m = { sinpi_1_9,  sinpi_2_9,  sinpi_3_9,  sinpi_4_9,    \
    142                      -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \
    143                                                                        \
    144     SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m);          \
    145     ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m);                   \
    146     ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                    \
    147     DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m);           \
    148     int0_m = tmp2_m + tmp1_m;                                          \
    149                                                                        \
    150     SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m);                            \
    151     ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m);                   \
    152     DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);           \
    153     int1_m = tmp0_m + tmp1_m;                                          \
    154                                                                        \
    155     c0_m = __msa_splati_h(mask_m, 6);                                  \
    156     ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m);                  \
    157     ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                    \
    158     DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);           \
    159     int2_m = tmp0_m + tmp1_m;                                          \
    160                                                                        \
    161     c0_m = __msa_splati_h(mask_m, 6);                                  \
    162     c0_m = __msa_ilvev_h(c0_m, k1_m);                                  \
    163     /* tmp2_m still holds the first dot product from above. */         \
    164     res0_m = __msa_ilvr_h((in1), (in3));                               \
    165     tmp0_m = __msa_dotp_s_w(res0_m, c0_m);                             \
    166     int3_m = tmp2_m + tmp0_m;                                          \
    167                                                                        \
    168     res0_m = __msa_ilvr_h((in2), (in3));                               \
    169     c1_m = __msa_ilvev_h(k4_m, k3_m);                                  \
    170                                                                        \
    171     tmp2_m = __msa_dotp_s_w(res0_m, c1_m);                             \
    172     res1_m = __msa_ilvr_h((in0), (in2));                               \
    173     c1_m = __msa_ilvev_h(k1_m, zero_m);                                \
    174                                                                        \
    175     tmp3_m = __msa_dotp_s_w(res1_m, c1_m);                             \
    176     int3_m += tmp2_m;                                                  \
    177     int3_m += tmp3_m;                                                  \
    178     /* Round by DCT_CONST_BITS and pack to halfwords. */               \
    179     SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS);       \
    180     PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1);           \
    181     PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3);           \
    182   }
    183 
        /* VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)
         *
         * GNU statement expression that evaluates to a v8i16 built from two lanes
         * of the constant table mask_h. SPLATI_H2_SH replicates lane idx1_h into
         * c0_m and lane idx2_h into c1_m (presumably - the helper is defined
         * elsewhere), and __msa_ilvev_h then merges them so that, per the MSA
         * definition of ilvev.h, the two values alternate across the result
         * (even lanes from idx1_h, odd lanes from idx2_h).
         *
         * It plays the same role as VP9_SET_COSPI_PAIR, but sources the values
         * from a vector already in a register instead of from scalar constants.
         */
    184 #define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)    \
    185   ({                                                  \
    186     v8i16 c0_m, c1_m;                                 \
    187                                                       \
    188     SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
    189     c0_m = __msa_ilvev_h(c1_m, c0_m);                 \
    190                                                       \
    191     c0_m;                                             \
    192   })
    193 
    194 /* Multiply and add macro.
         *
         * VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,
         *          out0, out1, out2, out3)
         *
         * Interleaves inp0 with inp1 and inp2 with inp3, then takes halfword dot
         * products against the constant-pair vectors:
         *   out0 <- (inp0, inp1) . cst0      out1 <- (inp0, inp1) . cst1
         *   out2 <- (inp2, inp3) . cst2      out3 <- (inp2, inp3) . cst3
         * Each 32-bit result is rounded with SRARI_W4_SW by DCT_CONST_BITS and
         * packed back to halfwords.
         *
         * The inputs are referenced only by the two ILVRL_H2_SH lines, which run
         * before any output is assigned. Callers in this file rely on that to
         * pass the same variables as both inputs and outputs.
         */
    195 #define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1,  \
    196                  out2, out3)                                                  \
    197   {                                                                           \
    198     v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                         \
    199     v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd;                         \
    200     /* Inputs are only read here, before any output is written. */            \
    201     ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m);                            \
    202     ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m);                            \
    203     DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \
    204                 cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd);            \
    205     SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS);  \
    206     PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1);      \
    207     DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \
    208                 cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd);            \
    209     SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS);  \
    210     PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3);      \
    211   }
    212 
    213 /* idct 8x8 macro
         *
         * VP9_IDCT8x8_1D(in0..in7, out0..out7)
         *
         * One 1-D pass over eight vectors. The constant pairs come from lanes of
         * mask_m (via VP9_SET_CONST_PAIR) and from VP9_SET_COSPI_PAIR.
         *
         * - in1, in7, in3, in5 are run through VP9_MADD in place. Their
         *   differences are then multiplied by the cospi_16_64 pairs to form
         *   tp5_m / tp6_m, and their sums form tp4_m / tp7_m.
         * - in0, in4, in2, in6 are run through VP9_MADD in place and combined
         *   by BUTTERFLY_4 into tp0_m..tp3_m.
         * - BUTTERFLY_8 merges tp0_m..tp7_m into out0..out7.
         *
         * All eight inputs are overwritten, so callers must not expect in0..in7
         * to survive the expansion.
         */
    214 #define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,    \
    215                        out2, out3, out4, out5, out6, out7)                    \
    216   {                                                                           \
    217     v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m;             \
    218     v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m;             \
    219     v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    220     v8i16 mask_m = { cospi_28_64, cospi_4_64,  cospi_20_64,  cospi_12_64,     \
    221                      cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 };  \
    222     /* in1, in7, in3, in5 are transformed in place by VP9_MADD. */            \
    223     k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5);                                  \
    224     k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0);                                  \
    225     k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3);                                  \
    226     k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2);                                  \
    227     VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
    228     SUB2(in1, in3, in7, in5, res0_m, res1_m);                                 \
    229     k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7);                                  \
    230     k1_m = __msa_splati_h(mask_m, 4);                                         \
    231                                                                               \
    232     ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m);                              \
    233     DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m,       \
    234                 tmp0_m, tmp1_m, tmp2_m, tmp3_m);                              \
    235     SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);              \
    236     tp4_m = in1 + in3;                                                        \
    237     PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m);                \
    238     tp7_m = in7 + in5;                                                        \
    239     k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                      \
    240     k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                       \
    241     VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \
    242     BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m);              \
    243     BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \
    244                 out1, out2, out3, out4, out5, out6, out7);                    \
    245   }
    246 
        /* VP9_IADST8x8_1D(in0..in7, out0..out7)
         *
         * One 1-D pass over eight vectors (inverse ADST, going by the name). The
         * work is kept in 32-bit lanes (v4i32 r*_m / m*_m) between the dot
         * products and the rounding step, and is narrowed to halfwords only by
         * the PCKEV_H2_* calls that follow each SRARI_W4_SW.
         *
         * Constant pairs are taken from lanes of mask1_m, mask2_m and mask3_m
         * with VP9_SET_CONST_PAIR.
         *
         * Side effects on the arguments:
         * - in1, in2, in3, in4, in5 and in7 are reused as scratch and hold
         *   intermediates afterwards; in0 and in6 are only read.
         * - out0, out2, out4 and out6 are written directly by BUTTERFLY_4 /
         *   PCKEV_H2_SH.
         * - out1, out3, out5 and out7 are the negations of the scratch values
         *   left in in1, in3, in5 and in7.
         */
    247 #define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,   \
    248                         out2, out3, out4, out5, out6, out7)                   \
    249   {                                                                           \
    250     v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m;                     \
    251     v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m;                                 \
    252     v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1;           \
    253     v8i16 mask1_m = { cospi_2_64,  cospi_30_64,  -cospi_2_64, cospi_10_64,    \
    254                       cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 };  \
    255     v8i16 mask2_m = { cospi_14_64,  -cospi_18_64, cospi_26_64, cospi_6_64,    \
    256                       -cospi_26_64, cospi_8_64,   cospi_24_64, -cospi_8_64 }; \
    257     v8i16 mask3_m = {                                                         \
    258       -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0         \
    259     };                                                                        \
    260                                                                               \
    261     k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1);                                 \
    262     k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2);                                 \
    263     ILVRL_H2_SH(in1, in0, in_s1, in_s0);                                      \
    264     DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m,     \
    265                 r1_m, r2_m, r3_m);                                            \
    266     k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7);                                 \
    267     k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1);                                 \
    268     ILVRL_H2_SH(in5, in4, in_s1, in_s0);                                      \
    269     DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m,     \
    270                 r5_m, r6_m, r7_m);                                            \
    271     ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
    272          m3_m);                                                               \
    273     SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    274     PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m);                      \
    275     SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
    276          m3_m);                                                               \
    277     SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    278     PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m);                          \
    279     k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4);                                 \
    280     k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5);                                 \
    281     ILVRL_H2_SH(in3, in2, in_s1, in_s0);                                      \
    282     DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m,     \
    283                 r1_m, r2_m, r3_m);                                            \
    284     k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3);                                 \
    285     k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4);                                 \
    286     ILVRL_H2_SH(in7, in6, in_s1, in_s0);                                      \
    287     DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m,     \
    288                 r5_m, r6_m, r7_m);                                            \
    289     ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
    290          m3_m);                                                               \
    291     SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    292     PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m);                      \
    293     SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
    294          m3_m);                                                               \
    295     SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    296     PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m);                          \
    297     ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m);                                      \
    298     BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3);         \
    299     k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6);                                 \
    300     k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7);                                 \
    301     ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0);                                    \
    302     DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m,     \
    303                 r1_m, r2_m, r3_m);                                            \
    304     k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1);                                 \
    305     DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m,   \
    306                 r6_m, r7_m);                                                  \
    307     ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m,    \
    308          m3_m);                                                               \
    309     SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    310     PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6);                           \
    311     SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m,    \
    312          m3_m);                                                               \
    313     SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    314     PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5);                            \
    315     k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2);                                 \
    316     k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3);                                 \
    317     ILVRL_H2_SH(in4, in3, in_s1, in_s0);                                      \
    318     DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m,     \
    319                 m1_m, m2_m, m3_m);                                            \
    320     SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    321     PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4);                           \
    322     ILVRL_H2_SW(in5, in2, m2_m, m3_m);                                        \
    323     DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m,   \
    324                 m2_m, m3_m);                                                  \
    325     SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    326     PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5);                           \
    327     /* Odd-indexed outputs are the negated intermediates. */                  \
    328     out1 = -in1;                                                              \
    329     out3 = -in3;                                                              \
    330     out5 = -in5;                                                              \
    331     out7 = -in7;                                                              \
    332   }
    333 
        /* VP9_IADST8x16_1D(r0..r15, out0..out15)
         *
         * One 1-D pass over sixteen vectors (16-point inverse ADST, going by the
         * name), organised in the four stages marked in the body:
         *
         * - stage 1: four MADD_BF calls combine the input pairs (r15,r0),
         *   (r7,r8), (r13,r2), (r5,r10), (r11,r4), (r3,r12), (r9,r6), (r1,r14)
         *   with odd-numbered cospi constants into g0_m..g15_m.
         * - stage 2: MADD_BF with the cospi_4/28 and cospi_12/20 pairs gives
         *   h0_m..h7_m. BUTTERFLY_4 writes out8..out11, and BUTTERFLY_8 then
         *   produces h8_m..h11_m and overwrites h6_m, h4_m, h2_m, h0_m.
         * - stage 3: BUTTERFLY_4 writes out0/out1; MADD_BF with the cospi_8/24
         *   pairs writes out4..out7 and out12..out15.
         * - stage 4: MADD_SHORT with the cospi_16 pairs writes out2/out3 and
         *   updates out6/out7, out10/out11 and out14/out15 in place.
         *
         * r0..r15 appear only as MADD_BF source operands. MADD_BF and MADD_SHORT
         * are not defined in this file (presumably txfm_macros_msa.h), so their
         * exact rounding and operand order should be checked there.
         */
    334 #define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11,     \
    335                          r12, r13, r14, r15, out0, out1, out2, out3, out4,     \
    336                          out5, out6, out7, out8, out9, out10, out11, out12,    \
    337                          out13, out14, out15)                                  \
    338   {                                                                            \
    339     v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m;                      \
    340     v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m;                \
    341     v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m;                      \
    342     v8i16 h8_m, h9_m, h10_m, h11_m;                                            \
    343     v8i16 k0_m, k1_m, k2_m, k3_m;                                              \
    344                                                                                \
    345     /* stage 1 */                                                              \
    346     k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);                        \
    347     k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);                       \
    348     k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);                       \
    349     k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);                      \
    350     MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m);  \
    351     k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);                        \
    352     k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);                       \
    353     k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);                       \
    354     k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);                      \
    355     MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \
    356     k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);                        \
    357     k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);                       \
    358     k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);                        \
    359     k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);                       \
    360     MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m,       \
    361             g11_m);                                                            \
    362     k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);                       \
    363     k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);                      \
    364     k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);                        \
    365     k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);                       \
    366     MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m,      \
    367             g15_m);                                                            \
    368                                                                                \
    369     /* stage 2 */                                                              \
    370     k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);                        \
    371     k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);                       \
    372     k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);                       \
    373     MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \
    374             h3_m);                                                             \
    375     k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);                       \
    376     k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);                      \
    377     k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);                      \
    378     MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m,      \
    379             h6_m, h7_m);                                                       \
    380     BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10);             \
    381     BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \
    382                 h10_m, h11_m, h6_m, h4_m, h2_m, h0_m);                         \
    383                                                                                \
    384     /* stage 3 */                                                              \
    385     BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m);           \
    386     k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                        \
    387     k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                       \
    388     k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);                       \
    389     MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5,  \
    390             out7);                                                             \
    391     MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14,      \
    392             out13, out15);                                                     \
    393                                                                                \
    394     /* stage 4 */                                                              \
    395     k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);                       \
    396     k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);                     \
    397     k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);                      \
    398     k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);                      \
    399     MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3);                          \
    400     MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7);                            \
    401     MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11);                        \
    402     MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15);                        \
    403   }
    404 
        /* Out-of-line 16-point 1-D transform passes. Only the declarations are
         * visible here; the notes below are inferred from the names and
         * signatures and should be checked against the definitions.
         */
        /* IDCT column pass: takes a mutable coefficient buffer and, presumably,
         * adds the result to the pixel block at dst (row pitch dst_stride). */
    405 void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
    406                                       int32_t dst_stride);
        /* IDCT row pass: reads input (const) and writes 16-bit results to output. */
    407 void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
        /* IADST column pass: same signature as the IDCT column pass above. */
    408 void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
    409                                        int32_t dst_stride);
        /* IADST row pass: reads input (const) and writes 16-bit results to output. */
    410 void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
    411 #endif  // VPX_DSP_MIPS_INV_TXFM_MSA_H_
    411 #endif  // VPX_DSP_MIPS_INV_TXFM_MSA_H_
    412