/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_dsp/mips/inv_txfm_msa.h"

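/* Load an 8-row x 32-column slice of coefficients and transpose it as four
 * 8x8 blocks, so that tmp_buf holds the 32 columns as consecutive 8-element
 * vectors (stride of 8 int16). */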
static void idct32x8_row_transpose_store(const int16_t *input,
                                         int16_t *tmp_buf) {
  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;

  /* 1st & 2nd 8x8 */
  LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
  LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
                     n3);
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
                     n7);
  ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
  ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
  ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);

  /* 3rd & 4th 8x8 */
  LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
  LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
                     n3);
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
                     n7);
  ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
  ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
  ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
  ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8);
}

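/* Even half of the 32-point row transform: processes the 16 even-indexed
 * coefficient vectors from tmp_buf and stores 16 intermediate vectors in
 * tmp_eve_buf. */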
static void idct32x8_row_even_process_store(int16_t *tmp_buf,
                                            int16_t *tmp_eve_buf) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;

  /* Even stage 1 */
  LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);

  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
  BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);

  loc1 = vec3;
  loc0 = vec1;

  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
  BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
  BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
  BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);

  /* Even stage 2 */
  LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);

  vec0 = reg0 + reg4;
  reg0 = reg0 - reg4;
  reg4 = reg6 + reg2;
  reg6 = reg6 - reg2;
  reg2 = reg1 + reg5;
  reg1 = reg1 - reg5;
  reg5 = reg7 + reg3;
  reg7 = reg7 - reg3;
  reg3 = vec0;

  vec1 = reg2;
  reg2 = reg3 + reg4;
  reg3 = reg3 - reg4;
  reg4 = reg5 - vec1;
  reg5 = reg5 + vec1;

  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
  DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);

  vec0 = reg0 - reg6;
  reg0 = reg0 + reg6;
  vec1 = reg7 - reg1;
  reg7 = reg7 + reg1;

  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);

  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
  BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
  ST_SH(loc0, (tmp_eve_buf + 15 * 8));
  ST_SH(loc1, (tmp_eve_buf));
  ST_SH(loc2, (tmp_eve_buf + 14 * 8));
  ST_SH(loc3, (tmp_eve_buf + 8));

  BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
  ST_SH(loc0, (tmp_eve_buf + 13 * 8));
  ST_SH(loc1, (tmp_eve_buf + 2 * 8));
  ST_SH(loc2, (tmp_eve_buf + 12 * 8));
  ST_SH(loc3, (tmp_eve_buf + 3 * 8));

  /* Store 8 */
  BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
  ST_SH(loc0, (tmp_eve_buf + 11 * 8));
  ST_SH(loc1, (tmp_eve_buf + 4 * 8));
  ST_SH(loc2, (tmp_eve_buf + 10 * 8));
  ST_SH(loc3, (tmp_eve_buf + 5 * 8));

  BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
  ST_SH(loc0, (tmp_eve_buf + 9 * 8));
  ST_SH(loc1, (tmp_eve_buf + 6 * 8));
  ST_SH(loc2, (tmp_eve_buf + 8 * 8));
  ST_SH(loc3, (tmp_eve_buf + 7 * 8));
}

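/* Odd half of the 32-point row transform: processes the 16 odd-indexed
 * coefficient vectors from tmp_buf and stores 16 intermediate vectors in
 * tmp_odd_buf. */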
static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
                                           int16_t *tmp_odd_buf) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;

  /* Odd stage 1 */
  reg0 = LD_SH(tmp_buf + 8);
  reg1 = LD_SH(tmp_buf + 7 * 8);
  reg2 = LD_SH(tmp_buf + 9 * 8);
  reg3 = LD_SH(tmp_buf + 15 * 8);
  reg4 = LD_SH(tmp_buf + 17 * 8);
  reg5 = LD_SH(tmp_buf + 23 * 8);
  reg6 = LD_SH(tmp_buf + 25 * 8);
  reg7 = LD_SH(tmp_buf + 31 * 8);

  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);

  vec0 = reg0 + reg3;
  reg0 = reg0 - reg3;
  reg3 = reg7 + reg4;
  reg7 = reg7 - reg4;
  reg4 = reg1 + reg2;
  reg1 = reg1 - reg2;
  reg2 = reg6 + reg5;
  reg6 = reg6 - reg5;
  reg5 = vec0;

  /* 4 Stores */
  ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
  ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);

  SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
  ST_SH2(vec0, vec1, (tmp_odd_buf), 8);

  /* 4 Stores */
  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
  BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
  ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);

  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
  ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);

  /* Odd stage 2 */
  /* 8 loads */
  reg0 = LD_SH(tmp_buf + 3 * 8);
  reg1 = LD_SH(tmp_buf + 5 * 8);
  reg2 = LD_SH(tmp_buf + 11 * 8);
  reg3 = LD_SH(tmp_buf + 13 * 8);
  reg4 = LD_SH(tmp_buf + 19 * 8);
  reg5 = LD_SH(tmp_buf + 21 * 8);
  reg6 = LD_SH(tmp_buf + 27 * 8);
  reg7 = LD_SH(tmp_buf + 29 * 8);

  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);

  /* 4 Stores */
  SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);

  BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
  ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);

  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
  ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);

  /* 4 Stores */
  ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3);
  BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
  ST_SH(reg0, (tmp_odd_buf + 13 * 8));
  ST_SH(reg1, (tmp_odd_buf + 14 * 8));

  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
  ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);

  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */

  /* Load 8 & Store 8 */
  LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
  LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);

  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
  ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);

  SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);

  SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);

  /* Load 8 & Store 8 */
  LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
  LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);

  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);

  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);

  SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
}

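/* Final butterfly of the row transform: combines the even and odd halves,
 * transposes the 32x8 result back to 8x32 and stores it to dst (stride 32),
 * staging the upper half through tmp_buf. */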
static void idct_butterfly_transpose_store(int16_t *tmp_buf,
                                           int16_t *tmp_eve_buf,
                                           int16_t *tmp_odd_buf, int16_t *dst) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;

  /* FINAL BUTTERFLY : Dependency on Even & Odd */
  vec0 = LD_SH(tmp_odd_buf);
  vec1 = LD_SH(tmp_odd_buf + 9 * 8);
  vec2 = LD_SH(tmp_odd_buf + 14 * 8);
  vec3 = LD_SH(tmp_odd_buf + 6 * 8);
  loc0 = LD_SH(tmp_eve_buf);
  loc1 = LD_SH(tmp_eve_buf + 8 * 8);
  loc2 = LD_SH(tmp_eve_buf + 4 * 8);
  loc3 = LD_SH(tmp_eve_buf + 12 * 8);

  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);

  ST_SH((loc0 - vec3), (tmp_buf + 31 * 8));
  ST_SH((loc1 - vec2), (tmp_buf + 23 * 8));
  ST_SH((loc2 - vec1), (tmp_buf + 27 * 8));
  ST_SH((loc3 - vec0), (tmp_buf + 19 * 8));

  /* Load 8 & Store 8 */
  vec0 = LD_SH(tmp_odd_buf + 4 * 8);
  vec1 = LD_SH(tmp_odd_buf + 13 * 8);
  vec2 = LD_SH(tmp_odd_buf + 10 * 8);
  vec3 = LD_SH(tmp_odd_buf + 3 * 8);
  loc0 = LD_SH(tmp_eve_buf + 2 * 8);
  loc1 = LD_SH(tmp_eve_buf + 10 * 8);
  loc2 = LD_SH(tmp_eve_buf + 6 * 8);
  loc3 = LD_SH(tmp_eve_buf + 14 * 8);

  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);

  ST_SH((loc0 - vec3), (tmp_buf + 29 * 8));
  ST_SH((loc1 - vec2), (tmp_buf + 21 * 8));
  ST_SH((loc2 - vec1), (tmp_buf + 25 * 8));
  ST_SH((loc3 - vec0), (tmp_buf + 17 * 8));

  /* Load 8 & Store 8 */
  vec0 = LD_SH(tmp_odd_buf + 2 * 8);
  vec1 = LD_SH(tmp_odd_buf + 11 * 8);
  vec2 = LD_SH(tmp_odd_buf + 12 * 8);
  vec3 = LD_SH(tmp_odd_buf + 7 * 8);
  loc0 = LD_SH(tmp_eve_buf + 1 * 8);
  loc1 = LD_SH(tmp_eve_buf + 9 * 8);
  loc2 = LD_SH(tmp_eve_buf + 5 * 8);
  loc3 = LD_SH(tmp_eve_buf + 13 * 8);

  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);

  ST_SH((loc0 - vec3), (tmp_buf + 30 * 8));
  ST_SH((loc1 - vec2), (tmp_buf + 22 * 8));
  ST_SH((loc2 - vec1), (tmp_buf + 26 * 8));
  ST_SH((loc3 - vec0), (tmp_buf + 18 * 8));

  /* Load 8 & Store 8 */
  vec0 = LD_SH(tmp_odd_buf + 5 * 8);
  vec1 = LD_SH(tmp_odd_buf + 15 * 8);
  vec2 = LD_SH(tmp_odd_buf + 8 * 8);
  vec3 = LD_SH(tmp_odd_buf + 1 * 8);
  loc0 = LD_SH(tmp_eve_buf + 3 * 8);
  loc1 = LD_SH(tmp_eve_buf + 11 * 8);
  loc2 = LD_SH(tmp_eve_buf + 7 * 8);
  loc3 = LD_SH(tmp_eve_buf + 15 * 8);

  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);

  ST_SH((loc0 - vec3), (tmp_buf + 28 * 8));
  ST_SH((loc1 - vec2), (tmp_buf + 20 * 8));
  ST_SH((loc2 - vec1), (tmp_buf + 24 * 8));
  ST_SH((loc3 - vec0), (tmp_buf + 16 * 8));

  /* Transpose : 16 vectors */
  /* 1st & 2nd 8x8 */
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
                     n3);
  ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
  ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);

  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
                     n7);
  ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
  ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);

  /* 3rd & 4th 8x8 */
  LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
  LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
                     n3);
  ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
  ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);

  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
                     n7);
  ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
  ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
}

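/* 1-D 32-point inverse transform of 8 rows: transpose, even/odd processing
 * and final butterfly, writing the 8x32 result to output. */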
static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
  DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);

  idct32x8_row_transpose_store(input, &tmp_buf[0]);
  idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
  idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
  idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
                                 output);
}

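/* Even half of the 32-point column transform: processes the even-indexed rows
 * of an 8-column slice (row stride of 32 int16) and stores 16 intermediate
 * vectors in tmp_eve_buf. */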
static void idct8x32_column_even_process_store(int16_t *tmp_buf,
                                               int16_t *tmp_eve_buf) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;

  /* Even stage 1 */
  LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
  tmp_buf += (2 * 32);

  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
  BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);

  loc1 = vec3;
  loc0 = vec1;

  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
  BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
  BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
  BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);

  /* Even stage 2 */
  /* Load 8 */
  LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);

  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);

  vec0 = reg0 + reg4;
  reg0 = reg0 - reg4;
  reg4 = reg6 + reg2;
  reg6 = reg6 - reg2;
  reg2 = reg1 + reg5;
  reg1 = reg1 - reg5;
  reg5 = reg7 + reg3;
  reg7 = reg7 - reg3;
  reg3 = vec0;

  vec1 = reg2;
  reg2 = reg3 + reg4;
  reg3 = reg3 - reg4;
  reg4 = reg5 - vec1;
  reg5 = reg5 + vec1;

  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
  DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);

  vec0 = reg0 - reg6;
  reg0 = reg0 + reg6;
  vec1 = reg7 - reg1;
  reg7 = reg7 + reg1;

  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);

  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
  /* Store 8 */
  BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
  ST_SH2(loc1, loc3, tmp_eve_buf, 8);
  ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8);

  BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
  ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8);
  ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8);

  /* Store 8 */
  BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
  ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
  ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);

  BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
  ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
  ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
}

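/* Odd half of the 32-point column transform: processes the odd-indexed rows
 * of an 8-column slice (row stride of 32 int16) and stores 16 intermediate
 * vectors in tmp_odd_buf. */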
static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
                                              int16_t *tmp_odd_buf) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;

  /* Odd stage 1 */
  reg0 = LD_SH(tmp_buf + 32);
  reg1 = LD_SH(tmp_buf + 7 * 32);
  reg2 = LD_SH(tmp_buf + 9 * 32);
  reg3 = LD_SH(tmp_buf + 15 * 32);
  reg4 = LD_SH(tmp_buf + 17 * 32);
  reg5 = LD_SH(tmp_buf + 23 * 32);
  reg6 = LD_SH(tmp_buf + 25 * 32);
  reg7 = LD_SH(tmp_buf + 31 * 32);

  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);

  vec0 = reg0 + reg3;
  reg0 = reg0 - reg3;
  reg3 = reg7 + reg4;
  reg7 = reg7 - reg4;
  reg4 = reg1 + reg2;
  reg1 = reg1 - reg2;
  reg2 = reg6 + reg5;
  reg6 = reg6 - reg5;
  reg5 = vec0;

  /* 4 Stores */
  ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
  ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
  SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
  ST_SH2(vec0, vec1, tmp_odd_buf, 8);

  /* 4 Stores */
  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
  BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
  ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
  ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);

  /* Odd stage 2 */
  /* 8 loads */
  reg0 = LD_SH(tmp_buf + 3 * 32);
  reg1 = LD_SH(tmp_buf + 5 * 32);
  reg2 = LD_SH(tmp_buf + 11 * 32);
  reg3 = LD_SH(tmp_buf + 13 * 32);
  reg4 = LD_SH(tmp_buf + 19 * 32);
  reg5 = LD_SH(tmp_buf + 21 * 32);
  reg6 = LD_SH(tmp_buf + 27 * 32);
  reg7 = LD_SH(tmp_buf + 29 * 32);

  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);

  /* 4 Stores */
  SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
  BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
  ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
  ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);

  /* 4 Stores */
  ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3);
  BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
  ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
  ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);

  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
  /* Load 8 & Store 8 */
  LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
  LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);

  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
  ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);

  SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);

  SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);

  /* Load 8 & Store 8 */
  LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
  LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);

  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);

  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);

  SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
}

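/* Final butterfly of the column transform: combines the even and odd halves,
 * rounds by 6 bits and adds the result to the destination block with
 * clamping. */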
static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
                                             int16_t *tmp_odd_buf, uint8_t *dst,
                                             int32_t dst_stride) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;

  /* FINAL BUTTERFLY : Dependency on Even & Odd */
  vec0 = LD_SH(tmp_odd_buf);
  vec1 = LD_SH(tmp_odd_buf + 9 * 8);
  vec2 = LD_SH(tmp_odd_buf + 14 * 8);
  vec3 = LD_SH(tmp_odd_buf + 6 * 8);
  loc0 = LD_SH(tmp_eve_buf);
  loc1 = LD_SH(tmp_eve_buf + 8 * 8);
  loc2 = LD_SH(tmp_eve_buf + 4 * 8);
  loc3 = LD_SH(tmp_eve_buf + 12 * 8);

  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
  SRARI_H4_SH(m0, m2, m4, m6, 6);
  VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);

  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
  SRARI_H4_SH(m0, m2, m4, m6, 6);
  VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4,
                      m6);

  /* Load 8 & Store 8 */
  vec0 = LD_SH(tmp_odd_buf + 4 * 8);
  vec1 = LD_SH(tmp_odd_buf + 13 * 8);
  vec2 = LD_SH(tmp_odd_buf + 10 * 8);
  vec3 = LD_SH(tmp_odd_buf + 3 * 8);
  loc0 = LD_SH(tmp_eve_buf + 2 * 8);
  loc1 = LD_SH(tmp_eve_buf + 10 * 8);
  loc2 = LD_SH(tmp_eve_buf + 6 * 8);
  loc3 = LD_SH(tmp_eve_buf + 14 * 8);

  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
  SRARI_H4_SH(m1, m3, m5, m7, 6);
  VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7);

  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
  SRARI_H4_SH(m1, m3, m5, m7, 6);
  VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5,
                      m7);

  /* Load 8 & Store 8 */
  vec0 = LD_SH(tmp_odd_buf + 2 * 8);
  vec1 = LD_SH(tmp_odd_buf + 11 * 8);
  vec2 = LD_SH(tmp_odd_buf + 12 * 8);
  vec3 = LD_SH(tmp_odd_buf + 7 * 8);
  loc0 = LD_SH(tmp_eve_buf + 1 * 8);
  loc1 = LD_SH(tmp_eve_buf + 9 * 8);
  loc2 = LD_SH(tmp_eve_buf + 5 * 8);
  loc3 = LD_SH(tmp_eve_buf + 13 * 8);

  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
  SRARI_H4_SH(n0, n2, n4, n6, 6);
  VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6);

  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
  SRARI_H4_SH(n0, n2, n4, n6, 6);
  VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4,
                      n6);

  /* Load 8 & Store 8 */
  vec0 = LD_SH(tmp_odd_buf + 5 * 8);
  vec1 = LD_SH(tmp_odd_buf + 15 * 8);
  vec2 = LD_SH(tmp_odd_buf + 8 * 8);
  vec3 = LD_SH(tmp_odd_buf + 1 * 8);
  loc0 = LD_SH(tmp_eve_buf + 3 * 8);
  loc1 = LD_SH(tmp_eve_buf + 11 * 8);
  loc2 = LD_SH(tmp_eve_buf + 7 * 8);
  loc3 = LD_SH(tmp_eve_buf + 15 * 8);

  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
  SRARI_H4_SH(n1, n3, n5, n7, 6);
  VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7);

  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
  SRARI_H4_SH(n1, n3, n5, n7, 6);
  VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5,
                      n7);
}

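/* 1-D 32-point inverse transform of an 8-column slice, with the rounded
 * result added to the destination block. */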
static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
                                           int32_t dst_stride) {
  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);

  idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
  idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
  idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
                                   dst_stride);
}

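/* Full 32x32 inverse transform: four 8-row passes into out_arr followed by
 * four 8-column passes that add the result to dst. */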
void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
                                int32_t dst_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
  int16_t *out_ptr = out_arr;

  /* transform rows */
  for (i = 0; i < 4; ++i) {
    /* process 32 * 8 block */
    idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
  }

  /* transform columns */
  for (i = 0; i < 4; ++i) {
    /* process 8 * 32 block */
    idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
                                   dst_stride);
  }
}

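/* Up to 34 non-zero coefficients: only the top-left 8x8 of the input is
 * non-zero, so the intermediate buffer is zeroed, a single 8-row pass is run
 * and the regular column passes follow. */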
void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
                              int32_t dst_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
  int16_t *out_ptr = out_arr;

  for (i = 32; i--;) {
    __asm__ __volatile__(
        "sw     $zero,      0(%[out_ptr])     \n\t"
        "sw     $zero,      4(%[out_ptr])     \n\t"
        "sw     $zero,      8(%[out_ptr])     \n\t"
        "sw     $zero,     12(%[out_ptr])     \n\t"
        "sw     $zero,     16(%[out_ptr])     \n\t"
        "sw     $zero,     20(%[out_ptr])     \n\t"
        "sw     $zero,     24(%[out_ptr])     \n\t"
        "sw     $zero,     28(%[out_ptr])     \n\t"
        "sw     $zero,     32(%[out_ptr])     \n\t"
        "sw     $zero,     36(%[out_ptr])     \n\t"
        "sw     $zero,     40(%[out_ptr])     \n\t"
        "sw     $zero,     44(%[out_ptr])     \n\t"
        "sw     $zero,     48(%[out_ptr])     \n\t"
        "sw     $zero,     52(%[out_ptr])     \n\t"
        "sw     $zero,     56(%[out_ptr])     \n\t"
        "sw     $zero,     60(%[out_ptr])     \n\t"

        :
        : [out_ptr] "r"(out_ptr));

    out_ptr += 32;
  }

  out_ptr = out_arr;

  /* rows: only upper-left 8x8 has non-zero coeff */
  idct32x8_1d_rows_msa(input, out_ptr);

  /* transform columns */
  for (i = 0; i < 4; ++i) {
    /* process 8 * 32 block */
    idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
                                   dst_stride);
  }
}

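/* DC-only case: compute the single DC value, replicate it and add it to every
 * pixel of the 32x32 block with clamping. */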
void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
                             int32_t dst_stride) {
  int32_t i;
  int16_t out;
  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
  v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec;

  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out, 6);

  vec = __msa_fill_h(out);

  for (i = 16; i--;) {
    LD_UB2(dst, 16, dst0, dst1);
    LD_UB2(dst + dst_stride, 16, dst2, dst3);

    UNPCK_UB_SH(dst0, res0, res4);
    UNPCK_UB_SH(dst1, res1, res5);
    UNPCK_UB_SH(dst2, res2, res6);
    UNPCK_UB_SH(dst3, res3, res7);
    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
    ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
    CLIP_SH4_0_255(res0, res1, res2, res3);
    CLIP_SH4_0_255(res4, res5, res6, res7);
    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
                tmp2, tmp3);

    ST_UB2(tmp0, tmp1, dst, 16);
    dst += dst_stride;
    ST_UB2(tmp2, tmp3, dst, 16);
    dst += dst_stride;
  }
}
    730