/* vpx_dsp/mips — 32x32 forward DCT, MIPS MSA implementation */
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "vpx_dsp/mips/fwd_txfm_msa.h"
     12 
/* Column pass, stage 1: load all 32 rows of an 8-column slice of the input,
 * scale each sample up by 4 (<< 2), and apply the first fdct32 butterfly
 * in[k] +/- in[31-k].  Sums (step*) land in rows 0..15 of temp_buff and
 * differences in rows 16..31; each temp_buff row is 8 int16_t wide. */
static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
                                              int32_t src_stride,
                                              int16_t *temp_buff) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 step0, step1, step2, step3;
  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
  v8i16 step0_1, step1_1, step2_1, step3_1;

  /* 1st and 2nd set */
  /* rows 0..3 pair with rows 28..31; rows 4..7 pair with rows 24..27 */
  LD_SH4(input, src_stride, in0, in1, in2, in3);
  LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
  LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
  LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
              step3, in4, in5, in6, in7);
  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
              step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
  ST_SH4(step0, step1, step2, step3, temp_buff, 8);
  ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);

  /* 3rd and 4th set */
  /* rows 8..11 pair with rows 20..23; rows 12..15 pair with rows 16..19 */
  LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
  LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
  LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
  LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
              step3, in4, in5, in6, in7);
  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
              step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
  ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
  ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
  /* (15 * 8) + 8 == 16 * 8: differences for rows 16..19 */
  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8);
}
     57 
/* Even half of the column-pass fdct32.  'input' holds the 32 butterflied
 * rows from fdct8x32_1d_column_load_butterfly (8 int16_t per row); only
 * the sum rows 0..15 are read here.  'temp' is the 32-wide big buffer, so
 * temp + 32 * k is output row k: the 16 even rows 0,2,...,30 are written
 * (offsets below are in int16_t units; row numbers noted per store). */
static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 temp0, temp1;

  /* fdct even */
  /* stage 2 butterfly: rows 0..3 vs 12..15, rows 4..7 vs 8..11 */
  LD_SH4(input, 8, in0, in1, in2, in3);
  LD_SH4(input + 96, 8, in12, in13, in14, in15);
  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2,
              vec3, in12, in13, in14, in15);
  LD_SH4(input + 32, 8, in4, in5, in6, in7);
  LD_SH4(input + 64, 8, in8, in9, in10, in11);
  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7,
              in8, in9, in10, in11);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp);       /* output row 0 */
  ST_SH(temp1, temp + 512); /* output row 16 */

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 256); /* output row 8 */
  ST_SH(temp1, temp + 768); /* output row 24 */

  SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 128); /* output row 4 */
  ST_SH(temp1, temp + 896); /* output row 28 */

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 640); /* output row 20 */
  ST_SH(temp1, temp + 384); /* output row 12 */

  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 64);  /* output row 2 */
  ST_SH(temp1, temp + 960); /* output row 30 */

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 576); /* output row 18 */
  ST_SH(temp1, temp + 448); /* output row 14 */

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 320); /* output row 10 */
  ST_SH(temp1, temp + 704); /* output row 22 */

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 192); /* output row 6 */
  ST_SH(temp1, temp + 832); /* output row 26 */
}
    131 
/* Odd half of the column-pass fdct32.  'input' points at the 16 difference
 * rows of the butterflied buffer (the caller passes tmp_buf + 128); note
 * this buffer is also reused as scratch — intermediate differences are
 * stored back into it and re-loaded below.  As called from
 * fdct8x32_1d_column, temp_ptr is the 32-wide big buffer offset to row 1,
 * so temp_ptr + 32 * k is output row k + 1: the 16 odd rows 1,3,...,31
 * are written (absolute row numbers noted per store). */
static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;

  in20 = LD_SH(input + 32);
  in21 = LD_SH(input + 40);
  in26 = LD_SH(input + 80);
  in27 = LD_SH(input + 88);

  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  in18 = LD_SH(input + 16);
  in19 = LD_SH(input + 24);
  in28 = LD_SH(input + 96);
  in29 = LD_SH(input + 104);

  /* spill differences back into the scratch buffer for the second pass */
  vec4 = in19 - in20;
  ST_SH(vec4, input + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, input + 40);
  vec4 = in29 - in26;
  ST_SH(vec4, input + 80);
  vec4 = in28 - in27;
  ST_SH(vec4, input + 88);

  in21 = in18 + in21;
  in20 = in19 + in20;
  in27 = in28 + in27;
  in26 = in29 + in26;

  LD_SH4(input + 48, 8, in22, in23, in24, in25);
  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);

  in16 = LD_SH(input);
  in17 = LD_SH(input + 8);
  in30 = LD_SH(input + 112);
  in31 = LD_SH(input + 120);

  /* spill differences back into the scratch buffer for the second pass */
  vec4 = in17 - in22;
  ST_SH(vec4, input + 16);
  vec4 = in16 - in23;
  ST_SH(vec4, input + 24);
  vec4 = in31 - in24;
  ST_SH(vec4, input + 96);
  vec4 = in30 - in25;
  ST_SH(vec4, input + 104);

  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr);       /* output row 1 */
  ST_SH(vec4, temp_ptr + 960); /* output row 31 */

  SUB2(in27, in26, in25, in24, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 448); /* output row 15 */
  ST_SH(vec4, temp_ptr + 512); /* output row 17 */

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec4, temp_ptr + 704); /* output row 23 */
  ST_SH(vec5, temp_ptr + 256); /* output row 9 */

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec4, temp_ptr + 192); /* output row 7 */
  ST_SH(vec5, temp_ptr + 768); /* output row 25 */

  /* second pass over the differences spilled above */
  LD_SH4(input + 16, 8, in22, in23, in20, in21);
  LD_SH4(input + 80, 8, in26, in27, in24, in25);
  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  ADD2(in28, in29, in31, in30, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 832); /* output row 27 */
  ST_SH(vec4, temp_ptr + 128); /* output row 5 */

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 320); /* output row 11 */
  ST_SH(vec4, temp_ptr + 640); /* output row 21 */
  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 576); /* output row 19 */
  ST_SH(vec4, temp_ptr + 384); /* output row 13 */

  ADD2(in29, in28, in30, in31, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 64);  /* output row 3 */
  ST_SH(vec4, temp_ptr + 896); /* output row 29 */
}
    245 
    246 static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
    247                                int16_t *tmp_buf, int16_t *tmp_buf_big) {
    248   fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
    249   fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
    250   fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
    251 }
    252 
/* Row pass, stage 1: load 8x8 tiles from the column-pass result (rows are
 * 32 int16_t apart in temp_buff), transpose each tile so transform rows
 * become vectors, then apply the first fdct32 butterfly across the 16
 * mirrored column pairs.  Sums (step*) go to output rows 0..15 and
 * differences to rows 16..31 (8 int16_t per output row). */
static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
                                           int16_t *output) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 step0, step1, step2, step3, step4, step5, step6, step7;

  /* 1st set: columns 0..7 butterflied against columns 24..31 */
  LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
               step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);

  /* 2nd set */
  /* columns 8..15 butterflied against columns 16..23 */
  LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
               step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
         (output + 8 * 8), 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
}
    285 
/* Even half of the row-pass fdct32 used for the first 8-row slice.  Unlike
 * fdct8x32_1d_row_even, the stage-3 work feeding the first four outputs is
 * done in 32-bit lanes (UNPCK_SH_SW + DOTP_CONST_PAIR_W) before packing
 * back to 16 bits — presumably to keep extra intermediate precision for
 * those outputs (TODO confirm against the C reference).  interm_ptr is
 * scratch used to spill the stage-2 butterfly results so the 16-bit copies
 * can be re-loaded later; the 16 even output rows go to out + 8 * k. */
static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
                                    int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
  /* spill stage-2 results; re-loaded below for the 16-bit paths */
  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);

  /* Stage 3 */
  /* widen vec0..vec7 to 32-bit lanes (left/right halves) */
  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w,
       tmp1_w, tmp2_w, tmp3_w);
  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r,
       vec1_r, vec2_r, vec3_r);

  /* butterfly on the left-half sums */
  tmp3_w = vec0_r + vec3_r;
  vec0_r = vec0_r - vec3_r;
  vec3_r = vec1_r + vec2_r;
  vec1_r = vec1_r - vec2_r;

  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
                    vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out, 8);

  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
                    vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out + 16, 8);

  /* remaining outputs computed in 16-bit from the spilled stage-2 data */
  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 32);
  ST_SH(in5, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 40);
  ST_SH(in5, out + 48);

  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 64);
  ST_SH(in5, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 72);
  ST_SH(in5, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 80);
  ST_SH(in5, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 96);
  ST_SH(in5, out + 88);
}
    390 
    391 static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
    392   v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    393   v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
    394   v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
    395 
    396   /* fdct32 even */
    397   /* stage 2 */
    398   LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    399   LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
    400 
    401   BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
    402                in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
    403                vec7, in8, in9, in10, in11, in12, in13, in14, in15);
    404 
    405   /* Stage 3 */
    406   ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
    407   BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
    408   DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
    409   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    410   ST_SH(temp0, out);
    411   ST_SH(temp1, out + 8);
    412 
    413   DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
    414   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    415   ST_SH(temp0, out + 16);
    416   ST_SH(temp1, out + 24);
    417 
    418   SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
    419   DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
    420   ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
    421   DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
    422   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    423   ST_SH(temp0, out + 32);
    424   ST_SH(temp1, out + 56);
    425 
    426   SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
    427   DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
    428   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    429   ST_SH(temp0, out + 40);
    430   ST_SH(temp1, out + 48);
    431 
    432   DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
    433   DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
    434   ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
    435   DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
    436   ADD2(in0, in1, in2, in3, vec0, vec7);
    437   DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
    438   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    439   ST_SH(temp0, out + 64);
    440   ST_SH(temp1, out + 120);
    441 
    442   SUB2(in0, in1, in2, in3, in0, in2);
    443   DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
    444   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    445   ST_SH(temp0, out + 72);
    446   ST_SH(temp1, out + 112);
    447 
    448   SUB2(in9, vec2, in14, vec5, vec2, vec5);
    449   DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
    450   SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5)
    451   DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
    452   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    453   ST_SH(temp0, out + 80);
    454   ST_SH(temp1, out + 104);
    455 
    456   ADD2(in3, in2, in0, in1, vec3, vec4);
    457   DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
    458   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    459   ST_SH(temp0, out + 96);
    460   ST_SH(temp1, out + 88);
    461 }
    462 
/* Odd half of the row-pass fdct32.  'temp' points at the 16 difference
 * rows from the row butterfly stage (the caller passes the working buffer
 * + 128); 'interm_ptr' is scratch used to spill intermediate differences
 * that are re-loaded for the second half of the computation.  The 16 odd
 * output rows are written to out + 8 * k. */
static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
                                int16_t *out) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;

  in20 = LD_SH(temp + 32);
  in21 = LD_SH(temp + 40);
  in26 = LD_SH(temp + 80);
  in27 = LD_SH(temp + 88);

  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  in18 = LD_SH(temp + 16);
  in19 = LD_SH(temp + 24);
  in28 = LD_SH(temp + 96);
  in29 = LD_SH(temp + 104);

  /* spill differences to scratch for the second pass below */
  vec4 = in19 - in20;
  ST_SH(vec4, interm_ptr + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, interm_ptr + 88);
  vec4 = in28 - in27;
  ST_SH(vec4, interm_ptr + 56);
  vec4 = in29 - in26;
  ST_SH(vec4, interm_ptr + 64);

  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);

  in22 = LD_SH(temp + 48);
  in23 = LD_SH(temp + 56);
  in24 = LD_SH(temp + 64);
  in25 = LD_SH(temp + 72);

  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);

  in16 = LD_SH(temp);
  in17 = LD_SH(temp + 8);
  in30 = LD_SH(temp + 112);
  in31 = LD_SH(temp + 120);

  /* spill differences to scratch for the second pass below */
  vec4 = in17 - in22;
  ST_SH(vec4, interm_ptr + 40);
  vec4 = in30 - in25;
  ST_SH(vec4, interm_ptr + 48);
  vec4 = in31 - in24;
  ST_SH(vec4, interm_ptr + 72);
  vec4 = in16 - in23;
  ST_SH(vec4, interm_ptr + 80);

  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);

  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);

  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out);
  ST_SH(vec4, out + 120);

  SUB2(in27, in26, in25, in24, in22, in21);

  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 112);
  ST_SH(vec4, out + 8);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);

  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec4, out + 16);
  ST_SH(vec5, out + 104);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec4, out + 24);
  ST_SH(vec5, out + 96);

  /* second pass over the differences spilled to scratch above */
  in20 = LD_SH(interm_ptr + 32);
  in21 = LD_SH(interm_ptr + 88);
  in27 = LD_SH(interm_ptr + 56);
  in26 = LD_SH(interm_ptr + 64);

  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);

  in22 = LD_SH(interm_ptr + 40);
  in25 = LD_SH(interm_ptr + 48);
  in24 = LD_SH(interm_ptr + 72);
  in23 = LD_SH(interm_ptr + 80);

  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  ADD2(in28, in29, in31, in30, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 32);
  ST_SH(vec4, out + 88);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 40);
  ST_SH(vec4, out + 80);

  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);

  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 72);
  ST_SH(vec4, out + 48);

  ADD2(in29, in28, in30, in31, in17, in18);

  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec4, out + 56);
  ST_SH(vec5, out + 64);
}
    594 
/* Final row-pass step: gather the 32 coefficient rows — left scattered
 * through 'temp' in the interleaved order produced by the even/odd stages
 * (see the load offsets below) — and transpose them 8x8 at a time into the
 * natural row-major 32-wide layout of 'output'. */
static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;

  /* 1st set */
  in0 = LD_SH(temp);
  in4 = LD_SH(temp + 32);
  in2 = LD_SH(temp + 64);
  in6 = LD_SH(temp + 96);
  in1 = LD_SH(temp + 128);
  in7 = LD_SH(temp + 152);
  in3 = LD_SH(temp + 192);
  in5 = LD_SH(temp + 216);

  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);

  /* 2nd set */
  in0_1 = LD_SH(temp + 16);
  in1_1 = LD_SH(temp + 232);
  in2_1 = LD_SH(temp + 80);
  in3_1 = LD_SH(temp + 168);
  in4_1 = LD_SH(temp + 48);
  in5_1 = LD_SH(temp + 176);
  in6_1 = LD_SH(temp + 112);
  in7_1 = LD_SH(temp + 240);

  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);

  /* 3rd set */
  in0 = LD_SH(temp + 8);
  in1 = LD_SH(temp + 136);
  in2 = LD_SH(temp + 72);
  in3 = LD_SH(temp + 200);
  in4 = LD_SH(temp + 40);
  in5 = LD_SH(temp + 208);
  in6 = LD_SH(temp + 104);
  in7 = LD_SH(temp + 144);

  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8,
         32);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);

  /* 4th set */
  in0_1 = LD_SH(temp + 24);
  in1_1 = LD_SH(temp + 224);
  in2_1 = LD_SH(temp + 88);
  in3_1 = LD_SH(temp + 160);
  in4_1 = LD_SH(temp + 56);
  in5_1 = LD_SH(temp + 184);
  in6_1 = LD_SH(temp + 120);
  in7_1 = LD_SH(temp + 248);

  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24,
         32);
}
    657 
    658 static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
    659   fdct8x32_1d_row_load_butterfly(temp, temp_buf);
    660   fdct8x32_1d_row_even(temp_buf, temp_buf);
    661   fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
    662   fdct8x32_1d_row_transpose_store(temp_buf, output);
    663 }
    664 
    665 static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
    666                                int16_t *output) {
    667   fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
    668   fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
    669   fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
    670   fdct8x32_1d_row_transpose_store(tmp_buf, output);
    671 }
    672 
    673 void vpx_fdct32x32_msa(const int16_t *input, int16_t *output,
    674                        int32_t src_stride) {
    675   int32_t i;
    676   DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
    677   DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
    678 
    679   /* column transform */
    680   for (i = 0; i < 4; ++i) {
    681     fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
    682                        tmp_buf_big + (8 * i));
    683   }
    684 
    685   /* row transform */
    686   fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
    687 
    688   /* row transform */
    689   for (i = 1; i < 4; ++i) {
    690     fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
    691   }
    692 }
    693 
/* Even half of the 32-tap row transform, rounding ("rd") flavor.
 *
 * Reads 16 v8i16 vectors (128 int16 stage-1 butterfly outputs) from temp
 * and stores the 16 even-half result vectors to out at the 8-element
 * offsets shown below.  FDCT_POSTPROC_2V_NEG_H is applied to all 16
 * vectors right after the stage-2 butterfly; presumably this carries the
 * extra rounding of the vpx_fdct32x32_rd path -- confirm against the
 * non-rd fdct8x32_1d_row_even. */
static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  /* vec0..vec7 receive the sums, in8..in15 the differences. */
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
  /* Round/post-process every stage-2 output, two vectors at a time. */
  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
  FDCT_POSTPROC_2V_NEG_H(in8, in9);
  FDCT_POSTPROC_2V_NEG_H(in10, in11);
  FDCT_POSTPROC_2V_NEG_H(in12, in13);
  FDCT_POSTPROC_2V_NEG_H(in14, in15);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);

  /* 4-point butterfly on the sum half. */
  temp0 = in0 + in3;
  in0 = in0 - in3;
  in3 = in1 + in2;
  in1 = in1 - in2;

  /* cospi_16_64 rotation -> vectors at out+0 and out+8. */
  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
  ST_SH(temp0, out);
  ST_SH(temp1, out + 8);

  /* cospi_24/8 rotation -> vectors at out+16 and out+24. */
  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  ST_SH(temp0, out + 16);
  ST_SH(temp1, out + 24);

  /* Difference half of vec0..vec7 -> out+32/56 and out+40/48. */
  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  ST_SH(temp0, out + 32);
  ST_SH(temp1, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  ST_SH(temp0, out + 40);
  ST_SH(temp1, out + 48);

  /* Remaining outputs derive from the stage-2 differences in8..in15. */
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  ST_SH(temp0, out + 64);
  ST_SH(temp1, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  ST_SH(temp0, out + 72);
  ST_SH(temp1, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  /* Note the negated first operand feeding the cospi_24/8 rotation. */
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  ST_SH(temp0, out + 80);
  ST_SH(temp1, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  ST_SH(temp0, out + 96);
  ST_SH(temp1, out + 88);
}
    770 
/* Odd half of the 32-tap row transform, rounding ("rd") flavor.
 *
 * Reads 16 v8i16 vectors (inputs 16..31 of the fdct32) from temp,
 * writes the 16 odd-half result vectors to out at 8-element offsets.
 * interm_ptr is scratch storage: eight difference vectors computed in
 * the first phase are spilled there (offsets 32..88) and reloaded in
 * the second phase because there are not enough live registers to keep
 * them.  FDCT_POSTPROC_2V_NEG_H rounds all 16 inputs up front, as in
 * the even _rd routine. */
static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
                                   int16_t *out) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
  v8i16 vec4, vec5;

  in20 = LD_SH(temp + 32);
  in21 = LD_SH(temp + 40);
  in26 = LD_SH(temp + 80);
  in27 = LD_SH(temp + 88);

  /* cospi_16_64 rotations of the middle input pairs. */
  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  FDCT_POSTPROC_2V_NEG_H(in20, in21);
  FDCT_POSTPROC_2V_NEG_H(in26, in27);

  in18 = LD_SH(temp + 16);
  in19 = LD_SH(temp + 24);
  in28 = LD_SH(temp + 96);
  in29 = LD_SH(temp + 104);

  FDCT_POSTPROC_2V_NEG_H(in18, in19);
  FDCT_POSTPROC_2V_NEG_H(in28, in29);

  /* Spill the difference terms to interm_ptr for the second phase. */
  vec4 = in19 - in20;
  ST_SH(vec4, interm_ptr + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, interm_ptr + 88);
  vec4 = in29 - in26;
  ST_SH(vec4, interm_ptr + 64);
  vec4 = in28 - in27;
  ST_SH(vec4, interm_ptr + 56);

  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);

  in22 = LD_SH(temp + 48);
  in23 = LD_SH(temp + 56);
  in24 = LD_SH(temp + 64);
  in25 = LD_SH(temp + 72);

  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
  FDCT_POSTPROC_2V_NEG_H(in22, in23);
  FDCT_POSTPROC_2V_NEG_H(in24, in25);

  in16 = LD_SH(temp);
  in17 = LD_SH(temp + 8);
  in30 = LD_SH(temp + 112);
  in31 = LD_SH(temp + 120);

  FDCT_POSTPROC_2V_NEG_H(in16, in17);
  FDCT_POSTPROC_2V_NEG_H(in30, in31);

  /* Spill the remaining four difference terms. */
  vec4 = in17 - in22;
  ST_SH(vec4, interm_ptr + 40);
  vec4 = in30 - in25;
  ST_SH(vec4, interm_ptr + 48);
  vec4 = in31 - in24;
  ST_SH(vec4, interm_ptr + 72);
  vec4 = in16 - in23;
  ST_SH(vec4, interm_ptr + 80);

  /* Phase 1: sum path -> output vectors at out+0..24 and out+96..120. */
  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  ST_SH(vec5, out);
  ST_SH(vec4, out + 120);

  SUB2(in27, in26, in25, in24, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  ST_SH(vec5, out + 112);
  ST_SH(vec4, out + 8);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  /* Negated operand into the cospi_28/4 rotation. */
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  ST_SH(vec4, out + 16);
  ST_SH(vec5, out + 104);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  ST_SH(vec4, out + 24);
  ST_SH(vec5, out + 96);

  /* Phase 2: reload the spilled differences and produce out+32..88. */
  in20 = LD_SH(interm_ptr + 32);
  in21 = LD_SH(interm_ptr + 88);
  in27 = LD_SH(interm_ptr + 56);
  in26 = LD_SH(interm_ptr + 64);

  /* Copies needed because DOTP_CONST_PAIR overwrites in20/in21. */
  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);

  in22 = LD_SH(interm_ptr + 40);
  in25 = LD_SH(interm_ptr + 48);
  in24 = LD_SH(interm_ptr + 72);
  in23 = LD_SH(interm_ptr + 80);

  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  in16 = in28 + in29;
  in19 = in31 + in30;
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  ST_SH(vec5, out + 32);
  ST_SH(vec4, out + 88);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  ST_SH(vec5, out + 40);
  ST_SH(vec4, out + 80);

  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  ST_SH(vec5, out + 72);
  ST_SH(vec4, out + 48);

  ADD2(in29, in28, in30, in31, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  ST_SH(vec4, out + 56);
  ST_SH(vec5, out + 64);
}
    901 
    902 static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
    903                                int16_t *output) {
    904   fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
    905   fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
    906   fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
    907   fdct8x32_1d_row_transpose_store(tmp_buf, output);
    908 }
    909 
    910 void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
    911                           int32_t src_stride) {
    912   int32_t i;
    913   DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
    914   DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
    915 
    916   /* column transform */
    917   for (i = 0; i < 4; ++i) {
    918     fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
    919                        &tmp_buf_big[0] + (8 * i));
    920   }
    921 
    922   /* row transform */
    923   for (i = 0; i < 4; ++i) {
    924     fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
    925                        out + (8 * i * 32));
    926   }
    927 }
    928 
    929 void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
    930   int sum, i;
    931   v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    932   v4i32 vec_w = { 0 };
    933 
    934   for (i = 0; i < 16; ++i) {
    935     LD_SH4(input, 8, in0, in1, in2, in3);
    936     input += stride;
    937     LD_SH4(input, 8, in4, in5, in6, in7);
    938     input += stride;
    939     ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
    940     ADD2(in0, in2, in4, in6, in0, in4);
    941     vec_w += __msa_hadd_s_w(in0, in0);
    942     vec_w += __msa_hadd_s_w(in4, in4);
    943   }
    944 
    945   sum = HADD_SW_S32(vec_w);
    946   out[0] = (int16_t)(sum >> 3);
    947 }
    948