/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

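/* Inserts word element 0 of each of in0..in3 into word lanes 0..3 of 'out',
 * gathering the first four bytes of four row vectors into a single vector. */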
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

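/* SAD for a 4-pixel-wide block: four rows are loaded as 32-bit words, packed
 * into one vector per side, then reduced with an absolute-difference and
 * horizontal adds. Assumes height is a multiple of 4. */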
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

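/* SAD for an 8-pixel-wide block: PCKEV_D4_UB packs pairs of 8-byte rows into
 * full 16-byte vectors so one SAD_UB2_UH covers four rows per iteration.
 * Assumes height is a multiple of 4. */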
static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

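/* SAD for a 16-pixel-wide block; the loop is unrolled to process four rows
 * per iteration, two at a time. Assumes height is a multiple of 4. */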
static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

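/* SAD for a 32-pixel-wide block: each row spans two 16-byte vectors and the
 * loop body is unrolled four rows deep. Assumes height is a multiple of 4. */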
static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

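/* SAD for a 64-pixel-wide block: each row spans four 16-byte vectors. The
 * row is split across two v8u16 accumulators, which keeps every 16-bit lane
 * within range for heights up to 64 (64 * 4 * 255 = 65280 < 65535). Assumes
 * height is a multiple of 2. */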
static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}

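/* The *_x3 helpers compute three SADs in one pass over the source, with the
 * reference block taken at horizontal offsets 0, +1 and +2 pixels; the
 * shifted positions are formed with byte-wise element shifts of the loaded
 * reference rows. */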
static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);

    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref, ref0, ref1, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

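/* The *_x8 helpers extend the x3 scheme to eight SADs, covering horizontal
 * offsets 0 through +7 of the reference block. */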
static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, diff;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref0, ref1, ref;
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

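/* The *_x4d helpers compute SADs against four independent reference blocks,
 * one pointer per candidate in aref_ptr[], sharing a single pass over the
 * source block. */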
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_UW_U32(sad);
}

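/* The avgsad_* helpers implement the *_avg flavors: each reference row is
 * first averaged with the corresponding row of 'sec_pred' (the second,
 * compound prediction, stored contiguously with a stride equal to the block
 * width) and the SAD is taken against that average. */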
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_UW_U32(sad);
}

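/* The macros below stamp out the exported vpx_sad* entry points for every
 * supported block size by delegating to the width-specialized helpers. */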
#define VPX_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);