/* MIPS MSA intra-prediction kernels. (Stray code-browser header line removed.) */
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "./vpx_dsp_rtcd.h"
     12 #include "vpx_dsp/mips/macros_msa.h"
     13 
     14 #define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) {  \
     15   out0 = __msa_subs_u_h(out0, in0);                \
     16   out1 = __msa_subs_u_h(out1, in1);                \
     17 }
     18 
     19 static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
     20                                        int32_t dst_stride) {
     21   uint32_t src_data;
     22 
     23   src_data = LW(src);
     24 
     25   SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
     26 }
     27 
     28 static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
     29                                        int32_t dst_stride) {
     30   uint32_t row;
     31   uint32_t src_data1, src_data2;
     32 
     33   src_data1 = LW(src);
     34   src_data2 = LW(src + 4);
     35 
     36   for (row = 8; row--;) {
     37     SW(src_data1, dst);
     38     SW(src_data2, (dst + 4));
     39     dst += dst_stride;
     40   }
     41 }
     42 
     43 static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
     44                                          int32_t dst_stride) {
     45   uint32_t row;
     46   v16u8 src0;
     47 
     48   src0 = LD_UB(src);
     49 
     50   for (row = 16; row--;) {
     51     ST_UB(src0, dst);
     52     dst += dst_stride;
     53   }
     54 }
     55 
     56 static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
     57                                          int32_t dst_stride) {
     58   uint32_t row;
     59   v16u8 src1, src2;
     60 
     61   src1 = LD_UB(src);
     62   src2 = LD_UB(src + 16);
     63 
     64   for (row = 32; row--;) {
     65     ST_UB2(src1, src2, dst, 16);
     66     dst += dst_stride;
     67   }
     68 }
     69 
     70 static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
     71                                         int32_t dst_stride) {
     72   uint32_t out0, out1, out2, out3;
     73 
     74   out0 = src[0] * 0x01010101;
     75   out1 = src[1] * 0x01010101;
     76   out2 = src[2] * 0x01010101;
     77   out3 = src[3] * 0x01010101;
     78 
     79   SW4(out0, out1, out2, out3, dst, dst_stride);
     80 }
     81 
     82 static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
     83                                         int32_t dst_stride) {
     84   uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
     85 
     86   out0 = src[0] * 0x0101010101010101ull;
     87   out1 = src[1] * 0x0101010101010101ull;
     88   out2 = src[2] * 0x0101010101010101ull;
     89   out3 = src[3] * 0x0101010101010101ull;
     90   out4 = src[4] * 0x0101010101010101ull;
     91   out5 = src[5] * 0x0101010101010101ull;
     92   out6 = src[6] * 0x0101010101010101ull;
     93   out7 = src[7] * 0x0101010101010101ull;
     94 
     95   SD4(out0, out1, out2, out3, dst, dst_stride);
     96   dst += (4 * dst_stride);
     97   SD4(out4, out5, out6, out7, dst, dst_stride);
     98 }
     99 
    100 static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
    101                                           int32_t dst_stride) {
    102   uint32_t row;
    103   uint8_t inp0, inp1, inp2, inp3;
    104   v16u8 src0, src1, src2, src3;
    105 
    106   for (row = 4; row--;) {
    107     inp0 = src[0];
    108     inp1 = src[1];
    109     inp2 = src[2];
    110     inp3 = src[3];
    111     src += 4;
    112 
    113     src0 = (v16u8)__msa_fill_b(inp0);
    114     src1 = (v16u8)__msa_fill_b(inp1);
    115     src2 = (v16u8)__msa_fill_b(inp2);
    116     src3 = (v16u8)__msa_fill_b(inp3);
    117 
    118     ST_UB4(src0, src1, src2, src3, dst, dst_stride);
    119     dst += (4 * dst_stride);
    120   }
    121 }
    122 
    123 static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
    124                                           int32_t dst_stride) {
    125   uint32_t row;
    126   uint8_t inp0, inp1, inp2, inp3;
    127   v16u8 src0, src1, src2, src3;
    128 
    129   for (row = 8; row--;) {
    130     inp0 = src[0];
    131     inp1 = src[1];
    132     inp2 = src[2];
    133     inp3 = src[3];
    134     src += 4;
    135 
    136     src0 = (v16u8)__msa_fill_b(inp0);
    137     src1 = (v16u8)__msa_fill_b(inp1);
    138     src2 = (v16u8)__msa_fill_b(inp2);
    139     src3 = (v16u8)__msa_fill_b(inp3);
    140 
    141     ST_UB2(src0, src0, dst, 16);
    142     dst += dst_stride;
    143     ST_UB2(src1, src1, dst, 16);
    144     dst += dst_stride;
    145     ST_UB2(src2, src2, dst, 16);
    146     dst += dst_stride;
    147     ST_UB2(src3, src3, dst, 16);
    148     dst += dst_stride;
    149   }
    150 }
    151 
/* DC prediction, 4x4: average the 4 top and 4 left neighbor samples
 * (rounded) and fill the whole block with the result. */
static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  uint32_t val0, val1;
  v16i8 store, src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Pack the 4 top bytes and 4 left bytes into one vector. */
  val0 = LW(src_top);
  val1 = LW(src_left);
  INSERT_W2_SB(val0, val1, src);
  /* Successive horizontal pairwise adds reduce the 8 samples to a sum. */
  sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounded divide by 8 (shift right arithmetic, rounded, by 3). */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  /* Broadcast the DC byte and write it to all 4 rows. */
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_w((v4i32)store, 0);

  SW4(val0, val0, val0, val0, dst, dst_stride);
}
    173 
/* DC prediction, 4x4, from a single edge (top OR left): average the 4
 * neighbor samples (rounded) and fill the block.  Shared by the
 * dc_top and dc_left public predictors. */
static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint32_t val0;
  v16i8 store, data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;

  /* Load the 4 edge bytes, then pairwise-add down to one sum. */
  val0 = LW(src);
  data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
  sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  /* Rounded divide by 4. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
  /* Broadcast the DC byte and write it to all 4 rows. */
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_w((v4i32)store, 0);

  SW4(val0, val0, val0, val0, dst, dst_stride);
}
    191 
    192 static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
    193   uint32_t out;
    194   const v16i8 store = __msa_ldi_b(128);
    195 
    196   out = __msa_copy_u_w((v4i32)store, 0);
    197 
    198   SW4(out, out, out, out, dst, dst_stride);
    199 }
    200 
/* DC prediction, 8x8: average the 8 top and 8 left neighbor samples
 * (rounded) and fill the whole block with the result. */
static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  uint64_t val0, val1;
  v16i8 store;
  v16u8 src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Pack the 8 top bytes and 8 left bytes into one 16-byte vector. */
  val0 = LD(src_top);
  val1 = LD(src_left);
  INSERT_D2_UB(val0, val1, src);
  /* Pairwise adds, then a cross-half pack + add, reduce 16 samples. */
  sum_h = __msa_hadd_u_h(src, src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounded divide by 16. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  /* Broadcast the DC byte and store 8 identical rows. */
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}
    227 
/* DC prediction, 8x8, from a single edge (top OR left): average the 8
 * neighbor samples (rounded) and fill the block.  Shared by the
 * dc_top and dc_left public predictors. */
static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint64_t val0;
  v16i8 store;
  v16u8 data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Load the 8 edge bytes and reduce them with pairwise adds. */
  val0 = LD(src);
  data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounded divide by 8. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  /* Broadcast the DC byte and store 8 identical rows. */
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}
    250 
    251 static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
    252   uint64_t out;
    253   const v16i8 store = __msa_ldi_b(128);
    254 
    255   out = __msa_copy_u_d((v2i64)store, 0);
    256 
    257   SD4(out, out, out, out, dst, dst_stride);
    258   dst += (4 * dst_stride);
    259   SD4(out, out, out, out, dst, dst_stride);
    260 }
    261 
/* DC prediction, 16x16: average the 16 top and 16 left neighbor samples
 * (rounded) and fill the whole block with the result. */
static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t dst_stride) {
  v16u8 top, left, out;
  v8u16 sum_h, sum_top, sum_left;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Partial sums of both edges, then reduce to a single total. */
  top = LD_UB(src_top);
  left = LD_UB(src_left);
  HADD_UB2_UH(top, left, sum_top, sum_left);
  sum_h = sum_top + sum_left;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounded divide by 32, then broadcast the DC byte. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}
    285 
/* DC prediction, 16x16, from a single edge (top OR left): average the 16
 * neighbor samples (rounded) and fill the block.  Shared by the
 * dc_top and dc_left public predictors. */
static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  v16u8 data, out;
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Reduce the 16 edge bytes to a single sum via pairwise adds. */
  data = LD_UB(src);
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounded divide by 16, then broadcast the DC byte. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}
    306 
    307 static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
    308   const v16u8 out = (v16u8)__msa_ldi_b(128);
    309 
    310   ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
    311   dst += (8 * dst_stride);
    312   ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
    313 }
    314 
/* DC prediction, 32x32: average the 32 top and 32 left neighbor samples
 * (rounded) and fill the whole block with the result. */
static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t dst_stride) {
  uint32_t row;
  v16u8 top0, top1, left0, left1, out;
  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Partial sums of both 32-byte edges, then reduce to a single total. */
  LD_UB2(src_top, 16, top0, top1);
  LD_UB2(src_left, 16, left0, left1);
  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
  sum_h = sum_top0 + sum_top1;
  sum_h += sum_left0 + sum_left1;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounded divide by 64, then broadcast the DC byte. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  /* Two 32-byte rows per iteration x 16 iterations = 32 rows. */
  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}
    344 
/* DC prediction, 32x32, from a single edge (top OR left): average the 32
 * neighbor samples (rounded) and fill the block.  Shared by the
 * dc_top and dc_left public predictors. */
static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  v16u8 data0, data1, out;
  v8u16 sum_h, sum_data0, sum_data1;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Reduce the 32 edge bytes to one sum via pairwise adds. */
  LD_UB2(src, 16, data0, data1);
  HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
  sum_h = sum_data0 + sum_data1;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounded divide by 32, then broadcast the DC byte. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  /* Two 32-byte rows per iteration x 16 iterations = 32 rows. */
  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}
    370 
    371 static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
    372   uint32_t row;
    373   const v16u8 out = (v16u8)__msa_ldi_b(128);
    374 
    375   for (row = 16; row--;) {
    376     ST_UB2(out, out, dst, 16);
    377     dst += dst_stride;
    378     ST_UB2(out, out, dst, 16);
    379     dst += dst_stride;
    380   }
    381 }
    382 
/* TM (True Motion) prediction, 4x4:
 *   pred[r][c] = clip(left[r] + top[c] - top_left)
 * Computed as (left + top) with a saturating subtract of top_left
 * (clamps at 0), then saturation to 8 bits (clamps at 255). */
static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  uint32_t val;
  uint8_t top_left = src_top_ptr[-1];
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v16u8 src0, src1, src2, src3;
  v8u16 src_top_left, vec0, vec1, vec2, vec3;

  /* Broadcast the top-left corner and load the 4-byte top row. */
  src_top_left = (v8u16)__msa_fill_h(top_left);
  val = LW(src_top_ptr);
  src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);

  /* One splatted vector per left-neighbor sample (one per row). */
  src_left0 = __msa_fill_b(src_left[0]);
  src_left1 = __msa_fill_b(src_left[1]);
  src_left2 = __msa_fill_b(src_left[2]);
  src_left3 = __msa_fill_b(src_left[3]);

  /* Interleave left/top bytes so a horizontal add yields left + top. */
  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
             src_left3, src_top, src0, src1, src2, src3);
  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
  /* Subtract top_left with unsigned saturation (floor at 0)... */
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
  /* ...then saturate to 8 bits (ceiling at 255) and pack to bytes. */
  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
}
    410 
/* TM (True Motion) prediction, 8x8:
 *   pred[r][c] = clip(left[r] + top[c] - top_left)
 * Processes 4 rows per loop iteration; see intra_predict_tm_4x4_msa for
 * the per-row computation scheme. */
static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  uint64_t val;
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v8u16 src_top_left, vec0, vec1, vec2, vec3;
  v16u8 src0, src1, src2, src3;

  /* Load the 8-byte top row and broadcast the top-left corner. */
  val = LD(src_top_ptr);
  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 2; loop_cnt--;) {
    /* One splatted vector per left-neighbor sample (one per row). */
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    /* left + top via interleave + horizontal add, then saturating
     * subtract of top_left (floor 0), saturate to 8 bits, pack, store. */
    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
               src_left3, src_top, src0, src1, src2, src3);
    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
    443 
/* TM (True Motion) prediction, 16x16:
 *   pred[r][c] = clip(left[r] + top[c] - top_left)
 * Each row: interleave the splatted left sample with the 16-byte top row
 * (low and high halves), horizontal-add to get left + top, saturating
 * subtract of top_left (floor 0), saturate to 8 bits, pack and store.
 * Four rows per loop iteration. */
static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t dst_stride) {
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r, res_l;

  src_top = LD_SB(src_top_ptr);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 4; loop_cnt--;) {
    /* One splatted vector per left-neighbor sample (one per row). */
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    /* Row 0 of this group. */
    ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);

    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    /* Row 1. */
    ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    /* Row 2. */
    ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    /* Row 3. */
    ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;
  }
}
    492 
/* TM (True Motion) prediction, 32x32:
 *   pred[r][c] = clip(left[r] + top[c] - top_left)
 * The 32-byte top row is held in two vectors; each output row needs four
 * 16-lane halfword results (right/left halves of each top vector).
 * Four rows per loop iteration. */
static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t dst_stride) {
  uint8_t top_left = src_top[-1];
  uint32_t loop_cnt;
  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;

  LD_SB2(src_top, 16, src_top0, src_top1);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 8; loop_cnt--;) {
    /* One splatted vector per left-neighbor sample (one per row). */
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    /* Row 0: left + top via interleave + horizontal add, saturating
     * subtract of top_left (floor 0), saturate to 8 bits, pack, store. */
    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    /* Row 1. */
    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    /* Row 2. */
    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    /* Row 3. */
    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;
  }
}
    552 
/* Public V (vertical) predictor, 4x4: left neighbors are unused. */
void vpx_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_4x4_msa(above, dst, y_stride);
}
    559 
/* Public V (vertical) predictor, 8x8: left neighbors are unused. */
void vpx_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_8x8_msa(above, dst, y_stride);
}
    566 
/* Public V (vertical) predictor, 16x16: left neighbors are unused. */
void vpx_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_16x16_msa(above, dst, y_stride);
}
    573 
/* Public V (vertical) predictor, 32x32: left neighbors are unused. */
void vpx_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_32x32_msa(above, dst, y_stride);
}
    580 
/* Public H (horizontal) predictor, 4x4: above neighbors are unused. */
void vpx_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_4x4_msa(left, dst, y_stride);
}
    587 
/* Public H (horizontal) predictor, 8x8: above neighbors are unused. */
void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_8x8_msa(left, dst, y_stride);
}
    594 
/* Public H (horizontal) predictor, 16x16: above neighbors are unused. */
void vpx_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_16x16_msa(left, dst, y_stride);
}
    601 
/* Public H (horizontal) predictor, 32x32: above neighbors are unused. */
void vpx_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_32x32_msa(left, dst, y_stride);
}
    608 
/* Public DC predictor, 4x4: averages above and left neighbors. */
void vpx_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_4x4_msa(above, left, dst, y_stride);
}
    613 
/* Public DC predictor, 8x8: averages above and left neighbors. */
void vpx_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_8x8_msa(above, left, dst, y_stride);
}
    618 
/* Public DC predictor, 16x16: averages above and left neighbors. */
void vpx_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_16x16_msa(above, left, dst, y_stride);
}
    623 
/* Public DC predictor, 32x32: averages above and left neighbors. */
void vpx_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_32x32_msa(above, left, dst, y_stride);
}
    628 
/* Public DC_TOP predictor, 4x4: averages the above row only. */
void vpx_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
}
    635 
/* Public DC_TOP predictor, 8x8: averages the above row only. */
void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
}
    642 
/* Public DC_TOP predictor, 16x16: averages the above row only. */
void vpx_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
}
    649 
/* Public DC_TOP predictor, 32x32: averages the above row only. */
void vpx_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
}
    656 
/* Public DC_LEFT predictor, 4x4: averages the left column only
 * (reuses the single-edge DC helper). */
void vpx_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
}
    663 
/* Public DC_LEFT predictor, 8x8: averages the left column only
 * (reuses the single-edge DC helper). */
void vpx_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
}
    670 
/* Public DC_LEFT predictor, 16x16: averages the left column only
 * (reuses the single-edge DC helper). */
void vpx_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
}
    678 
/* Public DC_LEFT predictor, 32x32: averages the left column only
 * (reuses the single-edge DC helper). */
void vpx_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
}
    686 
/* Public DC_128 predictor, 4x4: no neighbors used; fills with 128. */
void vpx_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_4x4_msa(dst, y_stride);
}
    694 
/* Public DC_128 predictor, 8x8: no neighbors used; fills with 128. */
void vpx_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_8x8_msa(dst, y_stride);
}
    702 
/* Public DC_128 predictor, 16x16: no neighbors used; fills with 128. */
void vpx_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_16x16_msa(dst, y_stride);
}
    710 
/* Public DC_128 predictor, 32x32: no neighbors used; fills with 128. */
void vpx_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_32x32_msa(dst, y_stride);
}
    718 
/* Public TM (True Motion) predictor, 4x4.  Note: reads above[-1]. */
void vpx_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_4x4_msa(above, left, dst, y_stride);
}
    723 
/* Public TM (True Motion) predictor, 8x8.  Note: reads above[-1]. */
void vpx_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_8x8_msa(above, left, dst, y_stride);
}
    728 
/* Public TM (True Motion) predictor, 16x16.  Note: reads above[-1]. */
void vpx_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_16x16_msa(above, left, dst, y_stride);
}
    733 
/* Public TM (True Motion) predictor, 32x32.  Note: reads above[-1]. */
void vpx_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_32x32_msa(above, left, dst, y_stride);
}
    738