/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/macros_msa.h"
#include "vpx_dsp/variance.h"

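/* 2-tap bilinear filter taps for the eight subpel phases; each pair of taps
 * sums to 128, i.e. 1 << FILTER_BITS. */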
static const uint8_t bilinear_filters_msa[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
};

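/* Interleaves the bytes of 'src' and 'ref', takes their horizontal
 * differences as signed halfwords, accumulates the squared differences
 * into 'var' via dot-product, and adds the signed differences to 'sub'. */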
#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
                                                                    \
    sub += res_l0_m + res_l1_m;                                     \
  }

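/* variance = sse - sum * sum / (w * h), with 'shift' == log2(w * h);
 * e.g. for an 8x8 block: variance = sse - ((sum * sum) >> 6). The LARGE
 * variant widens sum * sum to 64 bits so that blocks of 32x32 and larger
 * cannot overflow the intermediate product. */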
#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  sse - (((int64_t)diff * diff) >> shift)

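/* The avg_sse_diff_* helpers below average 'src' with the second predictor
 * 'sec_pred' before accumulating the SSE against 'ref'; the sum of pixel
 * differences is returned through '*diff' so callers can derive the
 * variance with the macros above. */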
static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred, int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 pred, src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred, int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref, pred;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);

  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);

    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

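/* The sub_pixel_sse_diff_*_h helpers apply the 2-tap horizontal bilinear
 * filter, rounding by FILTER_BITS via SRARI, before accumulating the SSE
 * and the pixel-difference sum against 'dst'. */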
static uint32_t sub_pixel_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    CALC_MSE_AVG_B(src0, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 filt0, out, ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, dst0, var, avg);
    CALC_MSE_AVG_B(src1, dst1, var, avg);
    CALC_MSE_AVG_B(src2, dst2, var, avg);
    CALC_MSE_AVG_B(src3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

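/* Vertical counterparts: each output row is the 2-tap bilinear blend of two
 * consecutive input rows; 'src0 = src4' carries the last loaded row across
 * loop iterations. */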
static uint32_t sub_pixel_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4, out;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 ref = { 0 };
  v16u8 src2110, src4332;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

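/* Combined horizontal + vertical filtering: each row is first filtered
 * horizontally into the 'hz_out*' intermediates, then pairs of intermediate
 * rows are blended vertically, with FILTER_BITS rounding at both stages. */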
static uint32_t sub_pixel_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out, ref = { 0 };
  v16u8 filt_vt, filt_hz, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt_vt, filt_hz, vec0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    CALC_MSE_AVG_B(src2, ref2, var, avg);
    CALC_MSE_AVG_B(src3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert, height,
                                             &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert, height,
                                             &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

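/* The sub_pixel_avg_* variants additionally average the filtered result with
 * 'sec_pred' (the compound/second prediction) before computing the SSE and
 * difference sum. */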
static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 out, pred, filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 out, pred, filt0;
  v16u8 ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t subpel_avg_ssediff_16w_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v16u8 pred0, pred1, pred2, pred3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
                tmp2, tmp3);
    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
                tmp2, tmp3);

    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

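/* For the 16-pixel-wide worker above, 'width' is the row stride of
 * 'sec_pred': the 32- and 64-wide wrappers below process the block in
 * 16-pixel columns while stepping 'sec_pred' by the full row width. */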
   1097 static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
   1098     const uint8_t *src, int32_t src_stride, const uint8_t *dst,
   1099     int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
   1100     int32_t height, int32_t *diff) {
   1101   return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
   1102                                       sec_pred, filter, height, diff, 16);
   1103 }
   1104 
   1105 static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
   1106     const uint8_t *src, int32_t src_stride, const uint8_t *dst,
   1107     int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
   1108     int32_t height, int32_t *diff) {
   1109   uint32_t loop_cnt, sse = 0;
   1110   int32_t diff0[2];
   1111 
   1112   for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
   1113     sse +=
   1114         subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
   1115                                      filter, height, &diff0[loop_cnt], 32);
   1116     src += 16;
   1117     dst += 16;
   1118     sec_pred += 16;
   1119   }
   1120 
   1121   *diff = diff0[0] + diff0[1];
   1122 
   1123   return sse;
   1124 }
   1125 
   1126 static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
   1127     const uint8_t *src, int32_t src_stride, const uint8_t *dst,
   1128     int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
   1129     int32_t height, int32_t *diff) {
   1130   uint32_t loop_cnt, sse = 0;
   1131   int32_t diff0[4];
   1132 
   1133   for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
   1134     sse +=
   1135         subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
   1136                                      filter, height, &diff0[loop_cnt], 64);
   1137     src += 16;
   1138     dst += 16;
   1139     sec_pred += 16;
   1140   }
   1141 
   1142   *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
   1143 
   1144   return sse;
   1145 }
   1146 
   1147 static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
   1148     const uint8_t *src, int32_t src_stride, const uint8_t *dst,
   1149     int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
   1150     int32_t height, int32_t *diff) {
   1151   int16_t filtval;
   1152   uint32_t loop_cnt;
   1153   uint32_t ref0, ref1, ref2, ref3;
   1154   v16u8 src0, src1, src2, src3, src4;
   1155   v16u8 src10_r, src32_r, src21_r, src43_r;
   1156   v16u8 out, pred, ref = { 0 };
   1157   v16u8 src2110, src4332, filt0;
   1158   v8i16 avg = { 0 };
   1159   v4i32 vec, var = { 0 };
   1160   v8u16 tmp0, tmp1;
   1161 
   1162   filtval = LH(filter);
   1163   filt0 = (v16u8)__msa_fill_h(filtval);
   1164 
   1165   src0 = LD_UB(src);
   1166   src += src_stride;
   1167 
   1168   for (loop_cnt = (height >> 2); loop_cnt--;) {
   1169     LD_UB4(src, src_stride, src1, src2, src3, src4);
   1170     src += (4 * src_stride);
   1171     pred = LD_UB(sec_pred);
   1172     sec_pred += 16;
   1173     LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
   1174     dst += (4 * dst_stride);
   1175 
   1176     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
   1177     ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
   1178                src32_r, src43_r);
   1179     ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
   1180     DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
   1181     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
   1182 
   1183     out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
   1184     out = __msa_aver_u_b(out, pred);
   1185     CALC_MSE_AVG_B(out, ref, var, avg);
   1186     src0 = src4;
   1187   }
   1188 
   1189   vec = __msa_hadd_s_w(avg, avg);
   1190   *diff = HADD_SW_S32(vec);
   1191 
   1192   return HADD_SW_S32(var);
   1193 }
   1194 
   1195 static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
   1196     const uint8_t *src, int32_t src_stride, const uint8_t *dst,
   1197     int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
   1198     int32_t height, int32_t *diff) {
   1199   int16_t filtval;
   1200   uint32_t loop_cnt;
   1201   v16u8 src0, src1, src2, src3, src4;
   1202   v16u8 ref0, ref1, ref2, ref3;
   1203   v16u8 pred0, pred1, filt0;
   1204   v8u16 vec0, vec1, vec2, vec3;
   1205   v8u16 tmp0, tmp1, tmp2, tmp3;
   1206   v8i16 avg = { 0 };
   1207   v4i32 vec, var = { 0 };
   1208 
   1209   filtval = LH(filter);
   1210   filt0 = (v16u8)__msa_fill_h(filtval);
   1211 
   1212   src0 = LD_UB(src);
   1213   src += src_stride;
   1214 
   1215   for (loop_cnt = (height >> 2); loop_cnt--;) {
   1216     LD_UB4(src, src_stride, src1, src2, src3, src4);
   1217     src += (4 * src_stride);
   1218     LD_UB2(sec_pred, 16, pred0, pred1);
   1219     sec_pred += 32;
   1220     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
   1221     dst += (4 * dst_stride);
   1222     PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
   1223     ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
   1224                vec3);
   1225     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
   1226                 tmp2, tmp3);
   1227     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
   1228     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
   1229     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
   1230     CALC_MSE_AVG_B(src0, ref0, var, avg);
   1231     CALC_MSE_AVG_B(src1, ref1, var, avg);
   1232 
   1233     src0 = src4;
   1234   }
   1235 
   1236   vec = __msa_hadd_s_w(avg, avg);
   1237   *diff = HADD_SW_S32(vec);
   1238 
   1239   return HADD_SW_S32(var);
   1240 }
   1241 
   1242 static uint32_t subpel_avg_ssediff_16w_v_msa(
   1243     const uint8_t *src, int32_t src_stride, const uint8_t *dst,
   1244     int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
   1245     int32_t height, int32_t *diff, int32_t width) {
   1246   int16_t filtval;
   1247   uint32_t loop_cnt;
   1248   v16u8 ref0, ref1, ref2, ref3;
   1249   v16u8 pred0, pred1, pred2, pred3;
   1250   v16u8 src0, src1, src2, src3, src4;
   1251   v16u8 out0, out1, out2, out3, filt0;
   1252   v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
   1253   v8u16 tmp0, tmp1, tmp2, tmp3;
   1254   v8i16 avg = { 0 };
   1255   v4i32 vec, var = { 0 };
   1256 
   1257   filtval = LH(filter);
   1258   filt0 = (v16u8)__msa_fill_h(filtval);
   1259 
   1260   src0 = LD_UB(src);
   1261   src += src_stride;
   1262 
   1263   for (loop_cnt = (height >> 2); loop_cnt--;) {
   1264     LD_UB4(src, src_stride, src1, src2, src3, src4);
   1265     src += (4 * src_stride);
   1266     LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
   1267     sec_pred += (4 * width);
   1268 
   1269     ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
   1270     ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
   1271     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
   1272     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
   1273     out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
   1274 
   1275     ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
   1276     ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
   1277     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
   1278     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
   1279     out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
   1280 
   1281     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
   1282     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
   1283     out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
   1284 
   1285     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
   1286     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
   1287     out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
   1288 
   1289     src0 = src4;
   1290     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
   1291     dst += (4 * dst_stride);
   1292 
   1293     AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
   1294                 out2, out3);
   1295 
   1296     CALC_MSE_AVG_B(out0, ref0, var, avg);
   1297     CALC_MSE_AVG_B(out1, ref1, var, avg);
   1298     CALC_MSE_AVG_B(out2, ref2, var, avg);
   1299     CALC_MSE_AVG_B(out3, ref3, var, avg);
   1300   }
   1301 
   1302   vec = __msa_hadd_s_w(avg, avg);
   1303   *diff = HADD_SW_S32(vec);
   1304 
   1305   return HADD_SW_S32(var);
   1306 }
   1307 
   1308 static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
   1309     const uint8_t *src, int32_t src_stride, const uint8_t *dst,
   1310     int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
   1311     int32_t height, int32_t *diff) {
   1312   return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
   1313                                       sec_pred, filter, height, diff, 16);
   1314 }
   1315 
   1316 static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
   1317     const uint8_t *src, int32_t src_stride, const uint8_t *dst,
   1318     int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
   1319     int32_t height, int32_t *diff) {
   1320   uint32_t loop_cnt, sse = 0;
   1321   int32_t diff0[2];
   1322 
   1323   for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
   1324     sse +=
   1325         subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
   1326                                      filter, height, &diff0[loop_cnt], 32);
   1327     src += 16;
   1328     dst += 16;
   1329     sec_pred += 16;
   1330   }
   1331 
   1332   *diff = diff0[0] + diff0[1];
   1333 
   1334   return sse;
   1335 }
   1336 
static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

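/* 4-wide horizontal+vertical path: rows are filtered horizontally with
 * filt_hz, the filtered rows are then filtered vertically with filt_vt, and
 * the rounded output is averaged with sec_pred before the SSE/diff
 * accumulation. Four 4-pixel rows are packed into a single vector per
 * iteration, and the last source row (src4) is carried into the next
 * iteration as src0 so the vertical filter always has its previous row. */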
static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 out, pred, ref = { 0 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

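/* 8-wide hv path: hz_out0 and hz_out1 hold horizontally filtered rows and
 * alternate roles through the loop, so each vertical dot product pairs the
 * current filtered row with the previous one. */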
static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 pred0, pred1, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

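/* Shared 16-column hv kernel: each 16-pixel row is loaded as two 8-byte
 * halves and filtered horizontally, then consecutive filtered rows are
 * interleaved for the vertical dot product. As in the vertical kernel,
 * `width` is the sec_pred stride, which lets the 32- and 64-wide wrappers
 * below step through the prediction buffer one 16-pixel column at a time. */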
static uint32_t subpel_avg_ssediff_16w_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 out0, out1, out2, out3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

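/* 16-wide hv path: single pass of the shared kernel, sec_pred stride 16. */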
static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                       sec_pred, filter_horiz, filter_vert,
                                       height, diff, 16);
}

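/* 32-wide hv path: two 16-pixel columns. */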
static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

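/* 64-wide hv path: four 16-pixel columns. */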
static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

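/* variance = sse - (sum_diff * sum_diff) / (w * h); the division is done as
 * a right shift, so the shift argument is log2(w * h) (e.g. 8 for 16x16,
 * 12 for 64x64). Blocks of 512 pixels or more use VARIANCE_LARGE_WxH,
 * which squares the sum in 64 bits: sum_diff can reach 255 * w * h, and
 * from 16x32 upwards its square no longer fits in a uint32_t. */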
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

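/* Generates vpx_sub_pixel_variance<wd>x<ht>_msa(). xoffset/yoffset (0..7)
 * index the bilinear filter table; when either offset is zero, the
 * corresponding filter tap is skipped and the call falls through to the
 * h-only, v-only, or plain variance kernel. */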
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                              \
  uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(                           \
      const uint8_t *src, int32_t src_stride, int32_t xoffset,                \
      int32_t yoffset, const uint8_t *ref, int32_t ref_stride,                \
      uint32_t *sse) {                                                        \
    int32_t diff;                                                             \
    uint32_t var;                                                             \
    const uint8_t *h_filter = bilinear_filters_msa[xoffset];                  \
    const uint8_t *v_filter = bilinear_filters_msa[yoffset];                  \
                                                                              \
    if (yoffset) {                                                            \
      if (xoffset) {                                                          \
        *sse = sub_pixel_sse_diff_##wd##width_hv_msa(                         \
            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
      } else {                                                                \
        *sse = sub_pixel_sse_diff_##wd##width_v_msa(                          \
            src, src_stride, ref, ref_stride, v_filter, ht, &diff);           \
      }                                                                       \
                                                                              \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                             \
    } else {                                                                  \
      if (xoffset) {                                                          \
        *sse = sub_pixel_sse_diff_##wd##width_h_msa(                          \
            src, src_stride, ref, ref_stride, h_filter, ht, &diff);           \
                                                                              \
        var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                           \
      } else {                                                                \
        var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
                                            sse);                             \
      }                                                                       \
    }                                                                         \
                                                                              \
    return var;                                                               \
  }

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);

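/* Same dispatch structure for the compound-average variants: every branch
 * averages the (filtered) prediction with sec_pred before the SSE/diff
 * accumulation. */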
#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
  uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa(                       \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_msa[xoffset];                  \
    const uint8_t *v_filter = bilinear_filters_msa[yoffset];                  \
                                                                              \
    if (yoffset) {                                                            \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(                     \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(                      \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(                      \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr,     \
                                            ref_stride, sec_pred, ht, &diff); \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                              \
  }

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);

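/* 32x64 is spelled out rather than generated: its no-filter branch calls
 * the dedicated avg_sse_diff_32x64_msa kernel instead of the
 * height-parameterized avg_sse_diff_32width_msa used by the macro above. */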
uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                             int32_t src_stride,
                                             int32_t xoffset, int32_t yoffset,
                                             const uint8_t *ref_ptr,
                                             int32_t ref_stride, uint32_t *sse,
                                             const uint8_t *sec_pred) {
  int32_t diff;
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];

  if (yoffset) {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
          v_filter, 64, &diff);
    } else {
      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  v_filter, 64, &diff);
    }
  } else {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  h_filter, 64, &diff);
    } else {
      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
                                    sec_pred, &diff);
    }
  }

  return VARIANCE_32Wx64H(*sse, diff);
}

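/* The 64-wide compound variants likewise get their own macro so the
 * no-filter branch can reach the dedicated avg_sse_diff_64x<ht>_msa
 * kernels. */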
#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                           \
  uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(                           \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_msa[xoffset];                  \
    const uint8_t *v_filter = bilinear_filters_msa[yoffset];                  \
                                                                              \
    if (yoffset) {                                                            \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_64width_hv_msa(                         \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_64width_v_msa(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_64width_h_msa(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr,       \
                                          ref_stride, sec_pred, &diff);       \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_64Wx##ht##H(*sse, diff);                                  \
  }

VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
   1789