/* Home | History | Annotate | Download | only in mips */
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "./vpx_dsp_rtcd.h"
     12 #include "vpx_dsp/mips/macros_msa.h"
     13 
/* Saturating subtract helper for TM prediction: per unsigned halfword lane,
 * out = max(out - in, 0), applied to two vector pairs. */
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
  {                                             \
    out0 = __msa_subs_u_h(out0, in0);           \
    out1 = __msa_subs_u_h(out1, in1);           \
  }
     19 
     20 static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
     21                                        int32_t dst_stride) {
     22   uint32_t src_data;
     23 
     24   src_data = LW(src);
     25 
     26   SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
     27 }
     28 
     29 static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
     30                                        int32_t dst_stride) {
     31   uint32_t row;
     32   uint32_t src_data1, src_data2;
     33 
     34   src_data1 = LW(src);
     35   src_data2 = LW(src + 4);
     36 
     37   for (row = 8; row--;) {
     38     SW(src_data1, dst);
     39     SW(src_data2, (dst + 4));
     40     dst += dst_stride;
     41   }
     42 }
     43 
     44 static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
     45                                          int32_t dst_stride) {
     46   uint32_t row;
     47   v16u8 src0;
     48 
     49   src0 = LD_UB(src);
     50 
     51   for (row = 16; row--;) {
     52     ST_UB(src0, dst);
     53     dst += dst_stride;
     54   }
     55 }
     56 
     57 static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
     58                                          int32_t dst_stride) {
     59   uint32_t row;
     60   v16u8 src1, src2;
     61 
     62   src1 = LD_UB(src);
     63   src2 = LD_UB(src + 16);
     64 
     65   for (row = 32; row--;) {
     66     ST_UB2(src1, src2, dst, 16);
     67     dst += dst_stride;
     68   }
     69 }
     70 
     71 static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
     72                                         int32_t dst_stride) {
     73   uint32_t out0, out1, out2, out3;
     74 
     75   out0 = src[0] * 0x01010101;
     76   out1 = src[1] * 0x01010101;
     77   out2 = src[2] * 0x01010101;
     78   out3 = src[3] * 0x01010101;
     79 
     80   SW4(out0, out1, out2, out3, dst, dst_stride);
     81 }
     82 
     83 static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
     84                                         int32_t dst_stride) {
     85   uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
     86 
     87   out0 = src[0] * 0x0101010101010101ull;
     88   out1 = src[1] * 0x0101010101010101ull;
     89   out2 = src[2] * 0x0101010101010101ull;
     90   out3 = src[3] * 0x0101010101010101ull;
     91   out4 = src[4] * 0x0101010101010101ull;
     92   out5 = src[5] * 0x0101010101010101ull;
     93   out6 = src[6] * 0x0101010101010101ull;
     94   out7 = src[7] * 0x0101010101010101ull;
     95 
     96   SD4(out0, out1, out2, out3, dst, dst_stride);
     97   dst += (4 * dst_stride);
     98   SD4(out4, out5, out6, out7, dst, dst_stride);
     99 }
    100 
    101 static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
    102                                           int32_t dst_stride) {
    103   uint32_t row;
    104   uint8_t inp0, inp1, inp2, inp3;
    105   v16u8 src0, src1, src2, src3;
    106 
    107   for (row = 4; row--;) {
    108     inp0 = src[0];
    109     inp1 = src[1];
    110     inp2 = src[2];
    111     inp3 = src[3];
    112     src += 4;
    113 
    114     src0 = (v16u8)__msa_fill_b(inp0);
    115     src1 = (v16u8)__msa_fill_b(inp1);
    116     src2 = (v16u8)__msa_fill_b(inp2);
    117     src3 = (v16u8)__msa_fill_b(inp3);
    118 
    119     ST_UB4(src0, src1, src2, src3, dst, dst_stride);
    120     dst += (4 * dst_stride);
    121   }
    122 }
    123 
    124 static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
    125                                           int32_t dst_stride) {
    126   uint32_t row;
    127   uint8_t inp0, inp1, inp2, inp3;
    128   v16u8 src0, src1, src2, src3;
    129 
    130   for (row = 8; row--;) {
    131     inp0 = src[0];
    132     inp1 = src[1];
    133     inp2 = src[2];
    134     inp3 = src[3];
    135     src += 4;
    136 
    137     src0 = (v16u8)__msa_fill_b(inp0);
    138     src1 = (v16u8)__msa_fill_b(inp1);
    139     src2 = (v16u8)__msa_fill_b(inp2);
    140     src3 = (v16u8)__msa_fill_b(inp3);
    141 
    142     ST_UB2(src0, src0, dst, 16);
    143     dst += dst_stride;
    144     ST_UB2(src1, src1, dst, 16);
    145     dst += dst_stride;
    146     ST_UB2(src2, src2, dst, 16);
    147     dst += dst_stride;
    148     ST_UB2(src3, src3, dst, 16);
    149     dst += dst_stride;
    150   }
    151 }
    152 
/* DC 4x4 prediction: every output pixel is the rounded average of the
 * 4 above-pixels and 4 left-pixels. */
static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint32_t val0, val1;
  v16i8 store, src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Pack the 4 top and 4 left bytes into one vector. */
  val0 = LW(src_top);
  val1 = LW(src_left);
  INSERT_W2_SB(val0, val1, src);
  /* Horizontal-add reduction tree: bytes -> halfwords -> words -> dwords. */
  sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Divide the 8-pixel sum by 8 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  /* Broadcast the DC byte into a word and store it to all 4 rows. */
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_w((v4i32)store, 0);

  SW4(val0, val0, val0, val0, dst, dst_stride);
}
    174 
/* DC 4x4 prediction from a single edge (top-only or left-only): fill the
 * block with the rounded average of the 4 edge pixels. */
static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint32_t val0;
  v16i8 store, data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;

  val0 = LW(src);
  data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
  /* Sum the 4 edge bytes, then divide by 4 with rounding. */
  sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
  /* Broadcast the DC byte and store it to all 4 rows. */
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_w((v4i32)store, 0);

  SW4(val0, val0, val0, val0, dst, dst_stride);
}
    192 
    193 static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
    194   uint32_t out;
    195   const v16i8 store = __msa_ldi_b(128);
    196 
    197   out = __msa_copy_u_w((v4i32)store, 0);
    198 
    199   SW4(out, out, out, out, dst, dst_stride);
    200 }
    201 
/* DC 8x8 prediction: every output pixel is the rounded average of the
 * 8 above-pixels and 8 left-pixels. */
static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint64_t val0, val1;
  v16i8 store;
  v16u8 src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Load 8 top + 8 left bytes into one 16-byte vector. */
  val0 = LD(src_top);
  val1 = LD(src_left);
  INSERT_D2_UB(val0, val1, src);
  /* Horizontal-add tree, then fold the two doubleword partial sums. */
  sum_h = __msa_hadd_u_h(src, src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Divide the 16-pixel sum by 16 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  /* Broadcast the DC byte into a doubleword and store all 8 rows. */
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}
    228 
/* DC 8x8 prediction from a single edge (top-only or left-only): fill the
 * block with the rounded average of the 8 edge pixels. */
static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint64_t val0;
  v16i8 store;
  v16u8 data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  val0 = LD(src);
  data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
  /* Sum the 8 edge bytes via the horizontal-add tree. */
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Divide by 8 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  /* Broadcast the DC byte into a doubleword and store all 8 rows. */
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}
    251 
    252 static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
    253   uint64_t out;
    254   const v16i8 store = __msa_ldi_b(128);
    255 
    256   out = __msa_copy_u_d((v2i64)store, 0);
    257 
    258   SD4(out, out, out, out, dst, dst_stride);
    259   dst += (4 * dst_stride);
    260   SD4(out, out, out, out, dst, dst_stride);
    261 }
    262 
/* DC 16x16 prediction: every output pixel is the rounded average of the
 * 16 above-pixels and 16 left-pixels (sum of 32, >> 5 with rounding). */
static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  v16u8 top, left, out;
  v8u16 sum_h, sum_top, sum_left;
  v4u32 sum_w;
  v2u64 sum_d;

  top = LD_UB(src_top);
  left = LD_UB(src_left);
  /* Per-vector byte sums, combined, then reduced to a scalar sum. */
  HADD_UB2_UH(top, left, sum_top, sum_left);
  sum_h = sum_top + sum_left;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Divide the 32-pixel sum by 32 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  /* Splat the DC byte across a full vector and store all 16 rows. */
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}
    286 
/* DC 16x16 prediction from a single edge (top-only or left-only): fill the
 * block with the rounded average of the 16 edge pixels. */
static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  v16u8 data, out;
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  data = LD_UB(src);
  /* Reduce the 16 edge bytes to a scalar sum. */
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Divide by 16 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  /* Splat the DC byte across a full vector and store all 16 rows. */
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}
    307 
    308 static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
    309   const v16u8 out = (v16u8)__msa_ldi_b(128);
    310 
    311   ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
    312   dst += (8 * dst_stride);
    313   ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
    314 }
    315 
/* DC 32x32 prediction: every output pixel is the rounded average of the
 * 32 above-pixels and 32 left-pixels (sum of 64, >> 6 with rounding). */
static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  uint32_t row;
  v16u8 top0, top1, left0, left1, out;
  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
  v4u32 sum_w;
  v2u64 sum_d;

  LD_UB2(src_top, 16, top0, top1);
  LD_UB2(src_left, 16, left0, left1);
  /* Per-vector byte sums for all four 16-byte edge segments. */
  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
  sum_h = sum_top0 + sum_top1;
  sum_h += sum_left0 + sum_left1;
  /* Reduce to a scalar sum of all 64 edge pixels. */
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Divide by 64 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  /* Two 32-byte rows per iteration, 16 iterations = 32 rows. */
  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}
    345 
/* DC 32x32 prediction from a single edge (top-only or left-only): fill the
 * block with the rounded average of the 32 edge pixels. */
static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  v16u8 data0, data1, out;
  v8u16 sum_h, sum_data0, sum_data1;
  v4u32 sum_w;
  v2u64 sum_d;

  LD_UB2(src, 16, data0, data1);
  /* Reduce the 32 edge bytes to a scalar sum. */
  HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
  sum_h = sum_data0 + sum_data1;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Divide by 32 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  /* Two 32-byte rows per iteration, 16 iterations = 32 rows. */
  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}
    371 
    372 static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
    373   uint32_t row;
    374   const v16u8 out = (v16u8)__msa_ldi_b(128);
    375 
    376   for (row = 16; row--;) {
    377     ST_UB2(out, out, dst, 16);
    378     dst += dst_stride;
    379     ST_UB2(out, out, dst, 16);
    380     dst += dst_stride;
    381   }
    382 }
    383 
/* TM (TrueMotion) 4x4 prediction:
 * pred(r, c) = clip8(left[r] + top[c] - top_left). */
static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint32_t val;
  uint8_t top_left = src_top_ptr[-1];
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v16u8 src0, src1, src2, src3;
  v8u16 src_top_left, vec0, vec1, vec2, vec3;

  src_top_left = (v8u16)__msa_fill_h(top_left);
  val = LW(src_top_ptr);
  src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);

  /* One splat vector per output row, holding that row's left pixel. */
  src_left0 = __msa_fill_b(src_left[0]);
  src_left1 = __msa_fill_b(src_left[1]);
  src_left2 = __msa_fill_b(src_left[2]);
  src_left3 = __msa_fill_b(src_left[3]);

  /* Interleave left with top, then pairwise-add bytes so each halfword
   * lane holds left[r] + top[c]. */
  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
             src_left3, src_top, src0, src1, src2, src3);
  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
  /* Saturating subtract of top_left clamps the low end at 0... */
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
  /* ...and saturation clamps the high end to the 8-bit range. */
  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
  /* Pack halfwords back to bytes and store the 4 rows. */
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
}
    411 
/* TM (TrueMotion) 8x8 prediction:
 * pred(r, c) = clip8(left[r] + top[c] - top_left), four rows per pass. */
static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint64_t val;
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v8u16 src_top_left, vec0, vec1, vec2, vec3;
  v16u8 src0, src1, src2, src3;

  val = LD(src_top_ptr);
  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  /* Two passes of four rows each. */
  for (loop_cnt = 2; loop_cnt--;) {
    /* One splat vector per row, holding that row's left pixel. */
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    /* left + top per halfword lane, then clamp to [0, 255] via the
     * saturating subtract of top_left followed by 8-bit saturation. */
    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
               src_left3, src_top, src0, src1, src2, src3);
    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
    444 
/* TM (TrueMotion) 16x16 prediction:
 * pred(r, c) = clip8(left[r] + top[c] - top_left), one full row per stanza,
 * four rows per loop iteration. */
static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r, res_l;

  src_top = LD_SB(src_top_ptr);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 4; loop_cnt--;) {
    /* One splat vector per row, holding that row's left pixel. */
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    /* Row 0 of this group: interleave left with the 16 top pixels
     * (right and left halves), add to halfwords, subtract top_left with
     * saturation at 0, saturate to 8 bits, pack and store. */
    ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);

    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    /* Row 1: same pipeline with the next left pixel. */
    ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    /* Row 2. */
    ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    /* Row 3. */
    ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;
  }
}
    493 
/* TM (TrueMotion) 32x32 prediction:
 * pred(r, c) = clip8(left[r] + top[c] - top_left). The 32-pixel top row is
 * held in two vectors; each stanza produces one full output row, four rows
 * per loop iteration. */
static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  uint8_t top_left = src_top[-1];
  uint32_t loop_cnt;
  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;

  LD_SB2(src_top, 16, src_top0, src_top1);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 8; loop_cnt--;) {
    /* One splat vector per row, holding that row's left pixel. */
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    /* Row 0: left + top per halfword lane for both 16-pixel halves,
     * saturating subtract of top_left (clamps at 0), saturate to 8 bits,
     * pack and store the 32-byte row. */
    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    /* Row 1: same pipeline with the next left pixel. */
    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    /* Row 2. */
    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    /* Row 3. */
    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;
  }
}
    553 
/* vpx_dsp RTCD entry points for vertical (V_PRED) intra prediction:
 * each row copies the `above` pixels; `left` is unused. */
void vpx_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_4x4_msa(above, dst, y_stride);
}

void vpx_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_8x8_msa(above, dst, y_stride);
}

void vpx_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_16x16_msa(above, dst, y_stride);
}

void vpx_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_32x32_msa(above, dst, y_stride);
}
    581 
/* vpx_dsp RTCD entry points for horizontal (H_PRED) intra prediction:
 * each row is filled with its `left` pixel; `above` is unused. */
void vpx_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_4x4_msa(left, dst, y_stride);
}

void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_8x8_msa(left, dst, y_stride);
}

void vpx_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_16x16_msa(left, dst, y_stride);
}

void vpx_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_32x32_msa(left, dst, y_stride);
}
    609 
/* vpx_dsp RTCD entry points for DC prediction using both the above row
 * and the left column. */
void vpx_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_4x4_msa(above, left, dst, y_stride);
}

void vpx_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_8x8_msa(above, left, dst, y_stride);
}

void vpx_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_16x16_msa(above, left, dst, y_stride);
}

void vpx_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_32x32_msa(above, left, dst, y_stride);
}
    629 
/* vpx_dsp RTCD entry points for top-only DC prediction: average of the
 * `above` pixels only; `left` is unused. */
void vpx_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
}

void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
}

void vpx_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
}

void vpx_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
}
    657 
/* vpx_dsp RTCD entry points for left-only DC prediction: average of the
 * `left` pixels only; `above` is unused. The same single-edge kernels are
 * shared with the top-only variants. */
void vpx_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
}

void vpx_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
}

void vpx_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
}

void vpx_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
}
    687 
/* vpx_dsp RTCD entry points for DC prediction when neither edge is
 * available: fill with the constant 128; both neighbors unused. */
void vpx_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_4x4_msa(dst, y_stride);
}

void vpx_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_8x8_msa(dst, y_stride);
}

void vpx_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_16x16_msa(dst, y_stride);
}

void vpx_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_32x32_msa(dst, y_stride);
}
    719 
/* vpx_dsp RTCD entry points for TM (TrueMotion) prediction: uses the above
 * row, the left column, and the top-left corner pixel (above[-1]). */
void vpx_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_4x4_msa(above, left, dst, y_stride);
}

void vpx_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_8x8_msa(above, left, dst, y_stride);
}

void vpx_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_16x16_msa(above, left, dst, y_stride);
}

void vpx_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_32x32_msa(above, left, dst, y_stride);
}
    739