/* Forward 32x32 DCT, MIPS MSA-optimized implementation. */
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "vpx_dsp/mips/fwd_txfm_msa.h"
     12 
     13 static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
     14                                               int32_t src_stride,
     15                                               int16_t *temp_buff) {
     16   v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
     17   v8i16 step0, step1, step2, step3;
     18   v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
     19   v8i16 step0_1, step1_1, step2_1, step3_1;
     20 
     21   /* 1st and 2nd set */
     22   LD_SH4(input, src_stride, in0, in1, in2, in3);
     23   LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
     24   LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
     25   LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
     26   SLLI_4V(in0, in1, in2, in3, 2);
     27   SLLI_4V(in4, in5, in6, in7, 2);
     28   SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
     29   SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
     30   BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
     31               step0, step1, step2, step3, in4, in5, in6, in7);
     32   BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
     33               step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
     34   ST_SH4(step0, step1, step2, step3, temp_buff, 8);
     35   ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
     36   ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
     37   ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);
     38 
     39   /* 3rd and 4th set */
     40   LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
     41   LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
     42   LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
     43   LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
     44   SLLI_4V(in0, in1, in2, in3, 2);
     45   SLLI_4V(in4, in5, in6, in7, 2);
     46   SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
     47   SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
     48   BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
     49               step0, step1, step2, step3, in4, in5, in6, in7);
     50   BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
     51               step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
     52   ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
     53   ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
     54   ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
     55   ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8);
     56 }
     57 
/* Even-half of the 1-D column forward 32-point DCT for one 8-column
 * strip: consumes the 16 low butterfly rows and emits the 16
 * even-indexed coefficients.
 *
 * input: 16 rows of butterfly output (stride 8 int16_t), read-only here.
 * temp:  column-transform output; coefficient k is stored at
 *        temp + 32 * k (e.g. +512 -> k=16, +64 -> k=2), i.e. the output
 *        is laid out as 32 coefficient rows of a 32-wide block.
 *
 * Each DOTP_CONST_PAIR below rotates one value pair by a cospi constant
 * pair, and FDCT32_POSTPROC_2V_POS_H applies the fdct32 rounding. */
static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 temp0, temp1;

  /* fdct even: second butterfly stage pairing rows r with 15 - r */
  LD_SH4(input, 8, in0, in1, in2, in3);
  LD_SH4(input + 96, 8, in12, in13, in14, in15);
  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15,
              vec0, vec1, vec2, vec3, in12, in13, in14, in15);
  LD_SH4(input + 32, 8, in4, in5, in6, in7);
  LD_SH4(input + 64, 8, in8, in9, in10, in11);
  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11,
              vec4, vec5, vec6, vec7, in8, in9, in10, in11);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
  /* coefficients 0 and 16 (offsets 0 and 512 = 32*16) */
  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp);
  ST_SH(temp1, temp + 512);

  /* coefficients 8 and 24 */
  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 256);
  ST_SH(temp1, temp + 768);

  /* coefficients 4 and 28 */
  SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 128);
  ST_SH(temp1, temp + 896);

  /* coefficients 20 and 12 */
  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 640);
  ST_SH(temp1, temp + 384);

  /* coefficients 2 and 30 */
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 64);
  ST_SH(temp1, temp + 960);

  /* coefficients 18 and 14 */
  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 576);
  ST_SH(temp1, temp + 448);

  /* coefficients 10 and 22 */
  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 320);
  ST_SH(temp1, temp + 704);

  /* coefficients 6 and 26 */
  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 192);
  ST_SH(temp1, temp + 832);
}
    131 
/* Odd-half of the 1-D column forward 32-point DCT for one 8-column
 * strip: consumes 16 butterfly rows and emits the 16 odd-indexed
 * coefficients.
 *
 * input:    16 rows (stride 8 int16_t). NOTE: this buffer is scratched —
 *           intermediate differences are stored back into it and
 *           reloaded later in the function.
 * temp_ptr: output base; coefficient offsets are multiples of 64
 *           (NOTE(review): the caller passes tmp_buf_big + 32, so
 *           temp_ptr + 64*k interleaves with the even-pass rows —
 *           confirm against fdct8x32_1d_column). */
static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;

  in20 = LD_SH(input + 32);
  in21 = LD_SH(input + 40);
  in26 = LD_SH(input + 80);
  in27 = LD_SH(input + 88);

  /* rotate the middle pairs by (cospi_16, cospi_16) */
  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  in18 = LD_SH(input + 16);
  in19 = LD_SH(input + 24);
  in28 = LD_SH(input + 96);
  in29 = LD_SH(input + 104);

  /* spill the difference terms back into the input buffer; they are
   * reloaded in the second half of this function */
  vec4 = in19 - in20;
  ST_SH(vec4, input + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, input + 40);
  vec4 = in29 - in26;
  ST_SH(vec4, input + 80);
  vec4 = in28 - in27;
  ST_SH(vec4, input + 88);

  in21 = in18 + in21;
  in20 = in19 + in20;
  in27 = in28 + in27;
  in26 = in29 + in26;

  LD_SH4(input + 48, 8, in22, in23, in24, in25);
  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);

  in16 = LD_SH(input);
  in17 = LD_SH(input + 8);
  in30 = LD_SH(input + 112);
  in31 = LD_SH(input + 120);

  /* spill the remaining difference terms */
  vec4 = in17 - in22;
  ST_SH(vec4, input + 16);
  vec4 = in16 - in23;
  ST_SH(vec4, input + 24);
  vec4 = in31 - in24;
  ST_SH(vec4, input + 96);
  vec4 = in30 - in25;
  ST_SH(vec4, input + 104);

  /* sum half: produces odd coefficients 1, 15, 9, 7 */
  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr);
  ST_SH(vec4, temp_ptr + 960);

  SUB2(in27, in26, in25, in24, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 448);
  ST_SH(vec4, temp_ptr + 512);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec4, temp_ptr + 704);
  ST_SH(vec5, temp_ptr + 256);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec4, temp_ptr + 192);
  ST_SH(vec5, temp_ptr + 768);

  /* difference half: reload the spilled terms from the input buffer */
  LD_SH4(input + 16, 8, in22, in23, in20, in21);
  LD_SH4(input + 80, 8, in26, in27, in24, in25);
  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  ADD2(in28, in29, in31, in30, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 832);
  ST_SH(vec4, temp_ptr + 128);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 320);
  ST_SH(vec4, temp_ptr + 640);
  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 576);
  ST_SH(vec4, temp_ptr + 384);

  ADD2(in29, in28, in30, in31, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 64);
  ST_SH(vec4, temp_ptr + 896);
}
    245 
    246 static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
    247                                int16_t *tmp_buf, int16_t *tmp_buf_big) {
    248   fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
    249   fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
    250   fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
    251 }
    252 
/* Row-pass input stage: transposes one 32x8 strip (stored as 8 rows of
 * 32) into column order and performs the first butterfly, pairing
 * column c with column 31 - c. Output rows land at output + r * 8. */
static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
                                           int16_t *output) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 step0, step1, step2, step3, step4, step5, step6, step7;

  /* 1st set: columns 0..7 paired with columns 24..31 */
  LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                     in8, in9, in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               step0, step1, step2, step3, step4, step5, step6, step7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);

  /* 2nd set: columns 8..15 paired with columns 16..23 */
  LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                     in8, in9, in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               step0, step1, step2, step3, step4, step5, step6, step7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
         (output + 8 * 8), 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
}
    287 
/* Even-half row pass, higher-precision variant used for the first 32x8
 * strip: the first stage-3 results are widened to 32 bits (UNPCK_SH_SW)
 * and rotated with DOTP_CONST_PAIR_W before being packed back to 16
 * bits, while later coefficients stay in 16-bit arithmetic.
 *
 * input:      16 butterfly rows (stride 8).
 * interm_ptr: 128-entry spill buffer for the stage-2 butterfly results,
 *             reloaded below after the registers have been reused.
 * out:        even-frequency outputs, two vectors per coefficient pair. */
static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
                                    int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  /* spill both halves; vec0..7 and in8..15 are clobbered below */
  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);

  /* Stage 3: widen to 32-bit lanes for the first two coefficient pairs */
  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r,
       tmp0_w, tmp1_w, tmp2_w, tmp3_w);
  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l,
       vec0_r, vec1_r, vec2_r, vec3_r);

  tmp3_w = vec0_r + vec3_r;
  vec0_r = vec0_r - vec3_r;
  vec3_r = vec1_r + vec2_r;
  vec1_r = vec1_r - vec2_r;

  /* 32-bit rotation, negative-rounding postproc, pack back to 16-bit */
  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64,
                    cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out, 8);

  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64,
                    cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out + 16, 8);

  /* remaining coefficients: reload spills, 16-bit path */
  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 32);
  ST_SH(in5, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 40);
  ST_SH(in5, out + 48);

  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 64);
  ST_SH(in5, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 72);
  ST_SH(in5, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 80);
  ST_SH(in5, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 96);
  ST_SH(in5, out + 88);
}
    393 
    394 static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
    395   v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    396   v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
    397   v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
    398 
    399   /* fdct32 even */
    400   /* stage 2 */
    401   LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    402   LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
    403 
    404   BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
    405                in8, in9, in10, in11, in12, in13, in14, in15,
    406                vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
    407                in8, in9, in10, in11, in12, in13, in14, in15);
    408 
    409   /* Stage 3 */
    410   ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
    411   BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
    412   DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
    413   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    414   ST_SH(temp0, out);
    415   ST_SH(temp1, out + 8);
    416 
    417   DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
    418   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    419   ST_SH(temp0, out + 16);
    420   ST_SH(temp1, out + 24);
    421 
    422   SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
    423   DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
    424   ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
    425   DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
    426   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    427   ST_SH(temp0, out + 32);
    428   ST_SH(temp1, out + 56);
    429 
    430   SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
    431   DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
    432   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    433   ST_SH(temp0, out + 40);
    434   ST_SH(temp1, out + 48);
    435 
    436   DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
    437   DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
    438   ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
    439   DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
    440   ADD2(in0, in1, in2, in3, vec0, vec7);
    441   DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
    442   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    443   ST_SH(temp0, out + 64);
    444   ST_SH(temp1, out + 120);
    445 
    446   SUB2(in0, in1, in2, in3, in0, in2);
    447   DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
    448   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    449   ST_SH(temp0, out + 72);
    450   ST_SH(temp1, out + 112);
    451 
    452   SUB2(in9, vec2, in14, vec5, vec2, vec5);
    453   DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
    454   SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5)
    455   DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
    456   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    457   ST_SH(temp0, out + 80);
    458   ST_SH(temp1, out + 104);
    459 
    460   ADD2(in3, in2, in0, in1, vec3, vec4);
    461   DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
    462   FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
    463   ST_SH(temp0, out + 96);
    464   ST_SH(temp1, out + 88);
    465 }
    466 
/* Odd-half row pass of the forward 32-point DCT for one 32x8 strip,
 * 16-bit arithmetic.
 *
 * temp:       16 butterfly rows, stride 8 int16_t (read-only here).
 * interm_ptr: spill buffer for intermediate difference terms; written
 *             early, reloaded in the second half of the function.
 * out:        odd-frequency outputs, one vector per store below. */
static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
                                int16_t *out) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;

  in20 = LD_SH(temp + 32);
  in21 = LD_SH(temp + 40);
  in26 = LD_SH(temp + 80);
  in27 = LD_SH(temp + 88);

  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  in18 = LD_SH(temp + 16);
  in19 = LD_SH(temp + 24);
  in28 = LD_SH(temp + 96);
  in29 = LD_SH(temp + 104);

  /* spill the difference terms; the offsets into interm_ptr are an
   * arbitrary scratch layout, matched by the reloads further down */
  vec4 = in19 - in20;
  ST_SH(vec4, interm_ptr + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, interm_ptr + 88);
  vec4 = in28 - in27;
  ST_SH(vec4, interm_ptr + 56);
  vec4 = in29 - in26;
  ST_SH(vec4, interm_ptr + 64);

  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);

  in22 = LD_SH(temp + 48);
  in23 = LD_SH(temp + 56);
  in24 = LD_SH(temp + 64);
  in25 = LD_SH(temp + 72);

  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);

  in16 = LD_SH(temp);
  in17 = LD_SH(temp + 8);
  in30 = LD_SH(temp + 112);
  in31 = LD_SH(temp + 120);

  /* spill the remaining difference terms */
  vec4 = in17 - in22;
  ST_SH(vec4, interm_ptr + 40);
  vec4 = in30 - in25;
  ST_SH(vec4, interm_ptr + 48);
  vec4 = in31 - in24;
  ST_SH(vec4, interm_ptr + 72);
  vec4 = in16 - in23;
  ST_SH(vec4, interm_ptr + 80);

  /* sum half */
  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);

  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);

  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out);
  ST_SH(vec4, out + 120);

  SUB2(in27, in26, in25, in24, in22, in21);

  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 112);
  ST_SH(vec4, out + 8);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);

  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec4, out + 16);
  ST_SH(vec5, out + 104);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec4, out + 24);
  ST_SH(vec5, out + 96);

  /* difference half: reload the spilled terms */
  in20 = LD_SH(interm_ptr + 32);
  in21 = LD_SH(interm_ptr + 88);
  in27 = LD_SH(interm_ptr + 56);
  in26 = LD_SH(interm_ptr + 64);

  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);

  in22 = LD_SH(interm_ptr + 40);
  in25 = LD_SH(interm_ptr + 48);
  in24 = LD_SH(interm_ptr + 72);
  in23 = LD_SH(interm_ptr + 80);

  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  ADD2(in28, in29, in31, in30, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 32);
  ST_SH(vec4, out + 88);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 40);
  ST_SH(vec4, out + 80);

  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);

  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 72);
  ST_SH(vec4, out + 48);

  ADD2(in29, in28, in30, in31, in17, in18);

  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec4, out + 56);
  ST_SH(vec5, out + 64);
}
    598 
/* Final row-pass stage: gathers the scattered even/odd coefficient
 * vectors, transposes them 8x8 at a time, and writes the strip to the
 * output in natural row order (stride 32).
 *
 * NOTE(review): the irregular load offsets (e.g. +152, +216) are the
 * store offsets used by the even pass (base 0) and the odd pass
 * (base +128) — confirm against fdct8x32_1d_row_even/_odd. */
static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;

  /* 1st set */
  in0 = LD_SH(temp);
  in4 = LD_SH(temp + 32);
  in2 = LD_SH(temp + 64);
  in6 = LD_SH(temp + 96);
  in1 = LD_SH(temp + 128);
  in7 = LD_SH(temp + 152);
  in3 = LD_SH(temp + 192);
  in5 = LD_SH(temp + 216);

  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);

  /* 2nd set */
  in0_1 = LD_SH(temp + 16);
  in1_1 = LD_SH(temp + 232);
  in2_1 = LD_SH(temp + 80);
  in3_1 = LD_SH(temp + 168);
  in4_1 = LD_SH(temp + 48);
  in5_1 = LD_SH(temp + 176);
  in6_1 = LD_SH(temp + 112);
  in7_1 = LD_SH(temp + 240);

  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);

  /* 3rd set */
  in0 = LD_SH(temp + 8);
  in1 = LD_SH(temp + 136);
  in2 = LD_SH(temp + 72);
  in3 = LD_SH(temp + 200);
  in4 = LD_SH(temp + 40);
  in5 = LD_SH(temp + 208);
  in6 = LD_SH(temp + 104);
  in7 = LD_SH(temp + 144);

  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
         output + 8, 32);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);

  /* 4th set */
  in0_1 = LD_SH(temp + 24);
  in1_1 = LD_SH(temp + 224);
  in2_1 = LD_SH(temp + 88);
  in3_1 = LD_SH(temp + 160);
  in4_1 = LD_SH(temp + 56);
  in5_1 = LD_SH(temp + 184);
  in6_1 = LD_SH(temp + 120);
  in7_1 = LD_SH(temp + 248);

  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
         output + 24, 32);
}
    661 
    662 static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf,
    663                             int16_t *output) {
    664   fdct8x32_1d_row_load_butterfly(temp, temp_buf);
    665   fdct8x32_1d_row_even(temp_buf, temp_buf);
    666   fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
    667   fdct8x32_1d_row_transpose_store(temp_buf, output);
    668 }
    669 
    670 static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
    671                                int16_t *output) {
    672   fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
    673   fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
    674   fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
    675   fdct8x32_1d_row_transpose_store(tmp_buf, output);
    676 }
    677 
    678 void vpx_fdct32x32_msa(const int16_t *input, int16_t *output,
    679                        int32_t src_stride) {
    680   int32_t i;
    681   DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
    682   DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
    683 
    684   /* column transform */
    685   for (i = 0; i < 4; ++i) {
    686     fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
    687                        tmp_buf_big + (8 * i));
    688   }
    689 
    690   /* row transform */
    691   fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
    692 
    693   /* row transform */
    694   for (i = 1; i < 4; ++i) {
    695     fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
    696   }
    697 }
    698 
/* Even half of the 32-point row-pass FDCT, "rd" variant used by
 * vpx_fdct32x32_rd_msa: the same butterfly network as the non-rd even
 * path, but with FDCT_POSTPROC_2V_NEG_H applied to every stage-2
 * intermediate (per its name, the rounding/post-processing step of the
 * rd path — see the macro in fwd_txfm_msa.h).
 *
 * temp : 16 vectors (128 int16 values) of even-half input, 8 per row.
 * out  : results stored at 8-element offsets (out, out + 8, ...,
 *        out + 120) for the later transpose/store pass.
 */
static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  /* rd-only: post-process all 16 stage-2 outputs before stage 3 */
  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
  FDCT_POSTPROC_2V_NEG_H(in8, in9);
  FDCT_POSTPROC_2V_NEG_H(in10, in11);
  FDCT_POSTPROC_2V_NEG_H(in12, in13);
  FDCT_POSTPROC_2V_NEG_H(in14, in15);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);

  /* 4-point butterfly on the top of the even tree */
  temp0 = in0 + in3;
  in0 = in0 - in3;
  in3 = in1 + in2;
  in1 = in1 - in2;

  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
  ST_SH(temp0, out);
  ST_SH(temp1, out + 8);

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  ST_SH(temp0, out + 16);
  ST_SH(temp1, out + 24);

  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  ST_SH(temp0, out + 32);
  ST_SH(temp1, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  ST_SH(temp0, out + 40);
  ST_SH(temp1, out + 48);

  /* lower half of the even tree (in8..in15 path) */
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  ST_SH(temp0, out + 64);
  ST_SH(temp1, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  ST_SH(temp0, out + 72);
  ST_SH(temp1, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  ST_SH(temp0, out + 80);
  ST_SH(temp1, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  ST_SH(temp0, out + 96);
  ST_SH(temp1, out + 88);
}
    776 
/* Odd half of the 32-point row-pass FDCT, "rd" variant: same butterfly
 * network as the non-rd odd path, with FDCT_POSTPROC_2V_NEG_H applied to
 * the early-stage intermediates.
 *
 * temp       : 16 vectors (128 int16 values) of odd-half input.
 * interm_ptr : scratch area; the stage differences are spilled here
 *              (offsets +32..+88) while the sum path is processed, then
 *              reloaded for the second half of the network.
 * out        : results stored at 8-element offsets (out .. out + 120)
 *              for the later transpose/store pass.
 */
static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
                                   int16_t *out) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
  v8i16 vec4, vec5;

  in20 = LD_SH(temp + 32);
  in21 = LD_SH(temp + 40);
  in26 = LD_SH(temp + 80);
  in27 = LD_SH(temp + 88);

  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  /* rd-only rounding of the rotated pairs */
  FDCT_POSTPROC_2V_NEG_H(in20, in21);
  FDCT_POSTPROC_2V_NEG_H(in26, in27);

  in18 = LD_SH(temp + 16);
  in19 = LD_SH(temp + 24);
  in28 = LD_SH(temp + 96);
  in29 = LD_SH(temp + 104);

  FDCT_POSTPROC_2V_NEG_H(in18, in19);
  FDCT_POSTPROC_2V_NEG_H(in28, in29);

  /* spill the difference terms; reloaded below after the sum path */
  vec4 = in19 - in20;
  ST_SH(vec4, interm_ptr + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, interm_ptr + 88);
  vec4 = in29 - in26;
  ST_SH(vec4, interm_ptr + 64);
  vec4 = in28 - in27;
  ST_SH(vec4, interm_ptr + 56);

  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);

  in22 = LD_SH(temp + 48);
  in23 = LD_SH(temp + 56);
  in24 = LD_SH(temp + 64);
  in25 = LD_SH(temp + 72);

  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
  FDCT_POSTPROC_2V_NEG_H(in22, in23);
  FDCT_POSTPROC_2V_NEG_H(in24, in25);

  in16 = LD_SH(temp);
  in17 = LD_SH(temp + 8);
  in30 = LD_SH(temp + 112);
  in31 = LD_SH(temp + 120);

  FDCT_POSTPROC_2V_NEG_H(in16, in17);
  FDCT_POSTPROC_2V_NEG_H(in30, in31);

  /* spill the second set of difference terms */
  vec4 = in17 - in22;
  ST_SH(vec4, interm_ptr + 40);
  vec4 = in30 - in25;
  ST_SH(vec4, interm_ptr + 48);
  vec4 = in31 - in24;
  ST_SH(vec4, interm_ptr + 72);
  vec4 = in16 - in23;
  ST_SH(vec4, interm_ptr + 80);

  /* sum path: produces the first eight stored outputs */
  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  ST_SH(vec5, out);
  ST_SH(vec4, out + 120);

  SUB2(in27, in26, in25, in24, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  ST_SH(vec5, out + 112);
  ST_SH(vec4, out + 8);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  ST_SH(vec4, out + 16);
  ST_SH(vec5, out + 104);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  ST_SH(vec4, out + 24);
  ST_SH(vec5, out + 96);

  /* reload the spilled difference terms for the second half */
  in20 = LD_SH(interm_ptr + 32);
  in21 = LD_SH(interm_ptr + 88);
  in27 = LD_SH(interm_ptr + 56);
  in26 = LD_SH(interm_ptr + 64);

  /* copies needed because DOTP_CONST_PAIR overwrites its inputs */
  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);

  in22 = LD_SH(interm_ptr + 40);
  in25 = LD_SH(interm_ptr + 48);
  in24 = LD_SH(interm_ptr + 72);
  in23 = LD_SH(interm_ptr + 80);

  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  in16 = in28 + in29;
  in19 = in31 + in30;
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  ST_SH(vec5, out + 32);
  ST_SH(vec4, out + 88);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  ST_SH(vec5, out + 40);
  ST_SH(vec4, out + 80);

  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  ST_SH(vec5, out + 72);
  ST_SH(vec4, out + 48);

  ADD2(in29, in28, in30, in31, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  ST_SH(vec4, out + 56);
  ST_SH(vec5, out + 64);
}
    907 
/* Runs one 32x8 slice of the second (row) pass for the rd variant of the
 * 32x32 forward DCT: load + initial butterfly into tmp_buf, then the
 * rounded even/odd halves, then transpose into the output layout.
 *
 * tmp_buf_big : 8 input rows of 32 coefficients; also reused as the odd
 *               half's intermediate scratch (interm_ptr).
 * tmp_buf     : 256-element scratch (even half first, odd half at +128,
 *               transformed in place).
 * output      : final transposed coefficients for this slice.
 */
static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
                               int16_t *output) {
  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
  fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
  fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
  fdct8x32_1d_row_transpose_store(tmp_buf, output);
}
    915 
    916 void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
    917                           int32_t src_stride) {
    918   int32_t i;
    919   DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
    920   DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
    921 
    922   /* column transform */
    923   for (i = 0; i < 4; ++i) {
    924     fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
    925                        &tmp_buf_big[0] + (8 * i));
    926   }
    927 
    928   /* row transform */
    929   for (i = 0; i < 4; ++i) {
    930     fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
    931                        out + (8 * i * 32));
    932   }
    933 }
    934 
    935 void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
    936   out[1] = 0;
    937 
    938   out[0] = LD_HADD(input, stride);
    939   out[0] += LD_HADD(input + 8, stride);
    940   out[0] += LD_HADD(input + 16, stride);
    941   out[0] += LD_HADD(input + 24, stride);
    942   out[0] += LD_HADD(input + 32 * 8, stride);
    943   out[0] += LD_HADD(input + 32 * 8 + 8, stride);
    944   out[0] += LD_HADD(input + 32 * 8 + 16, stride);
    945   out[0] += LD_HADD(input + 32 * 8 + 24, stride);
    946   out[0] += LD_HADD(input + 32 * 16, stride);
    947   out[0] += LD_HADD(input + 32 * 16 + 8, stride);
    948   out[0] += LD_HADD(input + 32 * 16 + 16, stride);
    949   out[0] += LD_HADD(input + 32 * 16 + 24, stride);
    950   out[0] += LD_HADD(input + 32 * 24, stride);
    951   out[0] += LD_HADD(input + 32 * 24 + 8, stride);
    952   out[0] += LD_HADD(input + 32 * 24 + 16, stride);
    953   out[0] += LD_HADD(input + 32 * 24 + 24, stride);
    954   out[0] >>= 3;
    955 }
    956