Home | History | Annotate | Download | only in msa
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "./vp8_rtcd.h"
     12 #include "vpx_ports/mem.h"
     13 #include "vp8/common/filter.h"
     14 #include "vp8/common/mips/msa/vp8_macros_msa.h"
     15 
     16 DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_msa[7][8]) = {
     17   { 0, -6, 123, 12, -1, 0, 0, 0 },
     18   { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
     19   { 0, -9, 93, 50, -6, 0, 0, 0 },
     20   { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
     21   { 0, -6, 50, 93, -9, 0, 0, 0 },
     22   { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
     23   { 0, -1, 12, 123, -6, 0, 0, 0 },
     24 };
     25 
     26 static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
     27   /* 8 width cases */
     28   0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
     29   /* 4 width cases */
     30   0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
     31   /* 4 width cases */
     32   8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
     33 };
     34 
     35 #define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \
     36                         filt_h2)                                           \
     37   ({                                                                       \
     38     v16i8 vec0_m, vec1_m, vec2_m;                                          \
     39     v8i16 hz_out_m;                                                        \
     40                                                                            \
     41     VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,    \
     42                vec0_m, vec1_m, vec2_m);                                    \
     43     hz_out_m =                                                             \
     44         DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2);   \
     45                                                                            \
     46     hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT);                  \
     47     hz_out_m = __msa_sat_s_h(hz_out_m, 7);                                 \
     48                                                                            \
     49     hz_out_m;                                                              \
     50   })
     51 
     52 #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
     53                                    mask2, filt0, filt1, filt2, out0, out1) \
     54   {                                                                        \
     55     v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;                  \
     56                                                                            \
     57     VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
     58     DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                 \
     59     VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
     60     DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                \
     61     VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
     62     DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                \
     63   }
     64 
     65 #define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
     66                                    mask2, filt0, filt1, filt2, out0, out1,   \
     67                                    out2, out3)                               \
     68   {                                                                          \
     69     v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
     70                                                                              \
     71     VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
     72     VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
     73     DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
     74                 out0, out1, out2, out3);                                     \
     75     VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
     76     VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
     77     VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m);        \
     78     VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m);        \
     79     DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
     80                  out0, out1, out2, out3);                                    \
     81     DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \
     82                  out0, out1, out2, out3);                                    \
     83   }
     84 
     85 #define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)        \
     86   ({                                                         \
     87     v8i16 tmp0;                                              \
     88                                                              \
     89     tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);        \
     90     tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \
     91                                                              \
     92     tmp0;                                                    \
     93   })
     94 
     95 #define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)   \
     96   ({                                                                  \
     97     v16i8 vec0_m, vec1_m;                                             \
     98     v8i16 hz_out_m;                                                   \
     99                                                                       \
    100     VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \
    101     hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \
    102                                                                       \
    103     hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT);             \
    104     hz_out_m = __msa_sat_s_h(hz_out_m, 7);                            \
    105                                                                       \
    106     hz_out_m;                                                         \
    107   })
    108 
    109 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
    110                                    filt0, filt1, out0, out1)             \
    111   {                                                                      \
    112     v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
    113                                                                          \
    114     VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);    \
    115     DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);               \
    116     VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);    \
    117     DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);              \
    118   }
    119 
    120 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
    121                                    filt0, filt1, out0, out1, out2, out3)     \
    122   {                                                                          \
    123     v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
    124                                                                              \
    125     VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    126     VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    127     DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
    128                 out0, out1, out2, out3);                                     \
    129     VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    130     VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    131     DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
    132                  out0, out1, out2, out3);                                    \
    133   }
    134 
    135 static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
    136                                  uint8_t *RESTRICT dst, int32_t dst_stride,
    137                                  const int8_t *filter) {
    138   v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
    139   v16u8 mask0, mask1, mask2, out;
    140   v8i16 filt, out0, out1;
    141 
    142   mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
    143   src -= 2;
    144 
    145   filt = LD_SH(filter);
    146   SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
    147 
    148   mask1 = mask0 + 2;
    149   mask2 = mask0 + 4;
    150 
    151   LD_SB4(src, src_stride, src0, src1, src2, src3);
    152   XORI_B4_128_SB(src0, src1, src2, src3);
    153   HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
    154                              filt1, filt2, out0, out1);
    155   SRARI_H2_SH(out0, out1, VP8_FILTER_SHIFT);
    156   SAT_SH2_SH(out0, out1, 7);
    157   out = PCKEV_XORI128_UB(out0, out1);
    158   ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    159 }
    160 
    161 static void common_hz_6t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
    162                                  uint8_t *RESTRICT dst, int32_t dst_stride,
    163                                  const int8_t *filter) {
    164   v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
    165   v16u8 mask0, mask1, mask2, out;
    166   v8i16 filt, out0, out1, out2, out3;
    167 
    168   mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
    169   src -= 2;
    170 
    171   filt = LD_SH(filter);
    172   SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
    173 
    174   mask1 = mask0 + 2;
    175   mask2 = mask0 + 4;
    176 
    177   LD_SB4(src, src_stride, src0, src1, src2, src3);
    178   XORI_B4_128_SB(src0, src1, src2, src3);
    179   src += (4 * src_stride);
    180   HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
    181                              filt1, filt2, out0, out1);
    182   LD_SB4(src, src_stride, src0, src1, src2, src3);
    183   XORI_B4_128_SB(src0, src1, src2, src3);
    184   HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
    185                              filt1, filt2, out2, out3);
    186   SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    187   SAT_SH4_SH(out0, out1, out2, out3, 7);
    188   out = PCKEV_XORI128_UB(out0, out1);
    189   ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    190   dst += (4 * dst_stride);
    191   out = PCKEV_XORI128_UB(out2, out3);
    192   ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    193 }
    194 
    195 static void common_hz_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    196                                 uint8_t *RESTRICT dst, int32_t dst_stride,
    197                                 const int8_t *filter, int32_t height) {
    198   if (4 == height) {
    199     common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    200   } else if (8 == height) {
    201     common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    202   }
    203 }
    204 
    205 static void common_hz_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    206                                 uint8_t *RESTRICT dst, int32_t dst_stride,
    207                                 const int8_t *filter, int32_t height) {
    208   uint32_t loop_cnt;
    209   v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
    210   v16u8 mask0, mask1, mask2, tmp0, tmp1;
    211   v8i16 filt, out0, out1, out2, out3;
    212 
    213   mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
    214   src -= 2;
    215 
    216   filt = LD_SH(filter);
    217   SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
    218 
    219   mask1 = mask0 + 2;
    220   mask2 = mask0 + 4;
    221 
    222   LD_SB4(src, src_stride, src0, src1, src2, src3);
    223   XORI_B4_128_SB(src0, src1, src2, src3);
    224   src += (4 * src_stride);
    225   HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
    226                              filt1, filt2, out0, out1, out2, out3);
    227   SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    228   SAT_SH4_SH(out0, out1, out2, out3, 7);
    229   tmp0 = PCKEV_XORI128_UB(out0, out1);
    230   tmp1 = PCKEV_XORI128_UB(out2, out3);
    231   ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    232   dst += (4 * dst_stride);
    233 
    234   for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
    235     LD_SB4(src, src_stride, src0, src1, src2, src3);
    236     XORI_B4_128_SB(src0, src1, src2, src3);
    237     src += (4 * src_stride);
    238     HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
    239                                filt0, filt1, filt2, out0, out1, out2, out3);
    240     SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    241     SAT_SH4_SH(out0, out1, out2, out3, 7);
    242     tmp0 = PCKEV_XORI128_UB(out0, out1);
    243     tmp1 = PCKEV_XORI128_UB(out2, out3);
    244     ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    245     dst += (4 * dst_stride);
    246   }
    247 }
    248 
    249 static void common_hz_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    250                                  uint8_t *RESTRICT dst, int32_t dst_stride,
    251                                  const int8_t *filter, int32_t height) {
    252   uint32_t loop_cnt;
    253   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
    254   v16u8 mask0, mask1, mask2, out;
    255   v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
    256 
    257   mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
    258   src -= 2;
    259 
    260   filt = LD_SH(filter);
    261   SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
    262 
    263   mask1 = mask0 + 2;
    264   mask2 = mask0 + 4;
    265 
    266   for (loop_cnt = (height >> 2); loop_cnt--;) {
    267     LD_SB4(src, src_stride, src0, src2, src4, src6);
    268     LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    269     XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    270     src += (4 * src_stride);
    271 
    272     HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
    273                                filt0, filt1, filt2, out0, out1, out2, out3);
    274     HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
    275                                filt0, filt1, filt2, out4, out5, out6, out7);
    276     SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    277     SRARI_H4_SH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
    278     SAT_SH4_SH(out0, out1, out2, out3, 7);
    279     SAT_SH4_SH(out4, out5, out6, out7, 7);
    280     out = PCKEV_XORI128_UB(out0, out1);
    281     ST_UB(out, dst);
    282     dst += dst_stride;
    283     out = PCKEV_XORI128_UB(out2, out3);
    284     ST_UB(out, dst);
    285     dst += dst_stride;
    286     out = PCKEV_XORI128_UB(out4, out5);
    287     ST_UB(out, dst);
    288     dst += dst_stride;
    289     out = PCKEV_XORI128_UB(out6, out7);
    290     ST_UB(out, dst);
    291     dst += dst_stride;
    292   }
    293 }
    294 
    295 static void common_vt_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    296                                 uint8_t *RESTRICT dst, int32_t dst_stride,
    297                                 const int8_t *filter, int32_t height) {
    298   uint32_t loop_cnt;
    299   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    300   v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    301   v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    302   v16u8 out;
    303   v8i16 filt, out10, out32;
    304 
    305   src -= (2 * src_stride);
    306 
    307   filt = LD_SH(filter);
    308   SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
    309 
    310   LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    311   src += (5 * src_stride);
    312 
    313   ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
    314              src32_r, src43_r);
    315   ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    316   XORI_B2_128_SB(src2110, src4332);
    317 
    318   for (loop_cnt = (height >> 2); loop_cnt--;) {
    319     LD_SB4(src, src_stride, src5, src6, src7, src8);
    320     src += (4 * src_stride);
    321 
    322     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
    323                src76_r, src87_r);
    324     ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    325     XORI_B2_128_SB(src6554, src8776);
    326     out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    327     out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    328     SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT);
    329     SAT_SH2_SH(out10, out32, 7);
    330     out = PCKEV_XORI128_UB(out10, out32);
    331     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    332     dst += (4 * dst_stride);
    333 
    334     src2110 = src6554;
    335     src4332 = src8776;
    336     src4 = src8;
    337   }
    338 }
    339 
    340 static void common_vt_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    341                                 uint8_t *RESTRICT dst, int32_t dst_stride,
    342                                 const int8_t *filter, int32_t height) {
    343   uint32_t loop_cnt;
    344   v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
    345   v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    346   v16i8 src109_r, filt0, filt1, filt2;
    347   v16u8 tmp0, tmp1;
    348   v8i16 filt, out0_r, out1_r, out2_r, out3_r;
    349 
    350   src -= (2 * src_stride);
    351 
    352   filt = LD_SH(filter);
    353   SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
    354 
    355   LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    356   src += (5 * src_stride);
    357 
    358   XORI_B5_128_SB(src0, src1, src2, src3, src4);
    359   ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3, src10_r, src32_r,
    360              src21_r, src43_r);
    361 
    362   for (loop_cnt = (height >> 2); loop_cnt--;) {
    363     LD_SB4(src, src_stride, src7, src8, src9, src10);
    364     XORI_B4_128_SB(src7, src8, src9, src10);
    365     src += (4 * src_stride);
    366 
    367     ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
    368                src87_r, src98_r, src109_r);
    369     out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
    370     out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
    371     out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
    372     out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    373     SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
    374     SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    375     tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
    376     tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
    377     ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    378     dst += (4 * dst_stride);
    379 
    380     src10_r = src76_r;
    381     src32_r = src98_r;
    382     src21_r = src87_r;
    383     src43_r = src109_r;
    384     src4 = src10;
    385   }
    386 }
    387 
    388 static void common_vt_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    389                                  uint8_t *RESTRICT dst, int32_t dst_stride,
    390                                  const int8_t *filter, int32_t height) {
    391   uint32_t loop_cnt;
    392   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    393   v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    394   v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    395   v16i8 src65_l, src87_l, filt0, filt1, filt2;
    396   v16u8 tmp0, tmp1, tmp2, tmp3;
    397   v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
    398 
    399   src -= (2 * src_stride);
    400 
    401   filt = LD_SH(filter);
    402   SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
    403 
    404   LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    405   src += (5 * src_stride);
    406 
    407   XORI_B5_128_SB(src0, src1, src2, src3, src4);
    408   ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r, src32_r,
    409              src43_r, src21_r);
    410   ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l, src32_l,
    411              src43_l, src21_l);
    412 
    413   for (loop_cnt = (height >> 2); loop_cnt--;) {
    414     LD_SB4(src, src_stride, src5, src6, src7, src8);
    415     src += (4 * src_stride);
    416 
    417     XORI_B4_128_SB(src5, src6, src7, src8);
    418     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
    419                src76_r, src87_r);
    420     ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
    421                src76_l, src87_l);
    422     out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    423     out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    424     out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    425     out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    426     out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
    427     out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    428     out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
    429     out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
    430     SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
    431     SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT);
    432     SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    433     SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
    434     PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
    435                 tmp0, tmp1, tmp2, tmp3);
    436     XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
    437     ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    438     dst += (4 * dst_stride);
    439 
    440     src10_r = src54_r;
    441     src32_r = src76_r;
    442     src21_r = src65_r;
    443     src43_r = src87_r;
    444     src10_l = src54_l;
    445     src32_l = src76_l;
    446     src21_l = src65_l;
    447     src43_l = src87_l;
    448     src4 = src8;
    449   }
    450 }
    451 
    452 static void common_hv_6ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    453                                      uint8_t *RESTRICT dst, int32_t dst_stride,
    454                                      const int8_t *filter_horiz,
    455                                      const int8_t *filter_vert,
    456                                      int32_t height) {
    457   uint32_t loop_cnt;
    458   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    459   v16i8 filt_hz0, filt_hz1, filt_hz2;
    460   v16u8 mask0, mask1, mask2, out;
    461   v8i16 tmp0, tmp1;
    462   v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    463   v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
    464 
    465   mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
    466   src -= (2 + 2 * src_stride);
    467 
    468   filt = LD_SH(filter_horiz);
    469   SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
    470   filt = LD_SH(filter_vert);
    471   SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
    472 
    473   mask1 = mask0 + 2;
    474   mask2 = mask0 + 4;
    475 
    476   LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    477   src += (5 * src_stride);
    478 
    479   XORI_B5_128_SB(src0, src1, src2, src3, src4);
    480   hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
    481                             filt_hz2);
    482   hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
    483                             filt_hz2);
    484   hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    485   hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
    486                             filt_hz2);
    487   ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    488 
    489   for (loop_cnt = (height >> 2); loop_cnt--;) {
    490     LD_SB2(src, src_stride, src5, src6);
    491     src += (2 * src_stride);
    492 
    493     XORI_B2_128_SB(src5, src6);
    494     hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
    495                               filt_hz1, filt_hz2);
    496     hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
    497 
    498     LD_SB2(src, src_stride, src7, src8);
    499     src += (2 * src_stride);
    500 
    501     XORI_B2_128_SB(src7, src8);
    502     hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
    503                               filt_hz1, filt_hz2);
    504     hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    505 
    506     out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    507     tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
    508 
    509     out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    510     tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
    511 
    512     SRARI_H2_SH(tmp0, tmp1, 7);
    513     SAT_SH2_SH(tmp0, tmp1, 7);
    514     out = PCKEV_XORI128_UB(tmp0, tmp1);
    515     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    516     dst += (4 * dst_stride);
    517 
    518     hz_out3 = hz_out7;
    519     out0 = out2;
    520     out1 = out3;
    521   }
    522 }
    523 
    524 static void common_hv_6ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    525                                      uint8_t *RESTRICT dst, int32_t dst_stride,
    526                                      const int8_t *filter_horiz,
    527                                      const int8_t *filter_vert,
    528                                      int32_t height) {
    529   uint32_t loop_cnt;
    530   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    531   v16i8 filt_hz0, filt_hz1, filt_hz2;
    532   v16u8 mask0, mask1, mask2, vec0, vec1;
    533   v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
    534   v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    535   v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
    536   v8i16 tmp0, tmp1, tmp2, tmp3;
    537 
    538   mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
    539   src -= (2 + 2 * src_stride);
    540 
    541   filt = LD_SH(filter_horiz);
    542   SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
    543 
    544   mask1 = mask0 + 2;
    545   mask2 = mask0 + 4;
    546 
    547   LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    548   src += (5 * src_stride);
    549 
    550   XORI_B5_128_SB(src0, src1, src2, src3, src4);
    551   hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
    552                             filt_hz2);
    553   hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
    554                             filt_hz2);
    555   hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
    556                             filt_hz2);
    557   hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
    558                             filt_hz2);
    559   hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
    560                             filt_hz2);
    561 
    562   filt = LD_SH(filter_vert);
    563   SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
    564 
    565   ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    566   ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
    567 
    568   for (loop_cnt = (height >> 2); loop_cnt--;) {
    569     LD_SB4(src, src_stride, src5, src6, src7, src8);
    570     src += (4 * src_stride);
    571 
    572     XORI_B4_128_SB(src5, src6, src7, src8);
    573     hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
    574                               filt_hz1, filt_hz2);
    575     out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    576     tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
    577 
    578     hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
    579                               filt_hz1, filt_hz2);
    580     out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
    581     tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
    582 
    583     hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
    584                               filt_hz1, filt_hz2);
    585     out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    586     tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
    587 
    588     hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
    589                               filt_hz1, filt_hz2);
    590     out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    591     tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
    592 
    593     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    594     SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    595     vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
    596     vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
    597     ST8x4_UB(vec0, vec1, dst, dst_stride);
    598     dst += (4 * dst_stride);
    599 
    600     hz_out4 = hz_out8;
    601     out0 = out2;
    602     out1 = out7;
    603     out3 = out5;
    604     out4 = out6;
    605   }
    606 }
    607 
    608 static void common_hv_6ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    609                                       uint8_t *RESTRICT dst, int32_t dst_stride,
    610                                       const int8_t *filter_horiz,
    611                                       const int8_t *filter_vert,
    612                                       int32_t height) {
    613   int32_t multiple8_cnt;
    614   for (multiple8_cnt = 2; multiple8_cnt--;) {
    615     common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
    616                              filter_vert, height);
    617     src += 8;
    618     dst += 8;
    619   }
    620 }
    621 
    /*
     * Horizontal-only 4-tap filter for a single 4x4 block.
     *
     * src / src_stride: source pixels; reading starts one byte to the left of
     *                   src and covers four consecutive rows.
     * dst / dst_stride: destination; four rows are written via ST4x4_UB.
     * filter:           int8_t coefficients; halfword elements 0 and 1 of the
     *                   loaded vector (the first four bytes at `filter`) are
     *                   used.
     */
    622 static void common_hz_4t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
    623                                  uint8_t *RESTRICT dst, int32_t dst_stride,
    624                                  const int8_t *filter) {
    625   v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    626   v8i16 filt, out0, out1;
    627   v16u8 out;
    628 
          /* Shuffle mask: the first "4 width cases" row of vp8_mc_filt_mask_arr. */
    629   mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
          /* Back up one column so the leftmost tap has a pixel to read. */
    630   src -= 1;
    631 
    632   filt = LD_SH(filter);
    633   SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
    634 
          /* Same shuffle pattern with every index advanced by two bytes. */
    635   mask1 = mask0 + 2;
    636 
    637   LD_SB4(src, src_stride, src0, src1, src2, src3);
          /* NOTE(review): XOR with 128 presumably re-biases unsigned pixels into
           * the signed range for the signed multiply-accumulate; it is undone by
           * PCKEV_XORI128_UB below. Confirm in vp8_macros_msa.h. */
    638   XORI_B4_128_SB(src0, src1, src2, src3);
    639   HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
    640                              out0, out1);
          /* Round-shift by the filter precision, then saturate before packing. */
    641   SRARI_H2_SH(out0, out1, VP8_FILTER_SHIFT);
    642   SAT_SH2_SH(out0, out1, 7);
    643   out = PCKEV_XORI128_UB(out0, out1);
    644   ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    645 }
    646 
    /*
     * Horizontal-only 4-tap filter for a 4-wide, 8-row block.
     *
     * The eight rows are processed as two groups of four: each group is loaded,
     * filtered into a pair of halfword vectors (out0/out1, then out2/out3), and
     * all four vectors are rounded and saturated together before being stored as
     * two consecutive 4x4 groups.
     *
     * src / src_stride: source pixels; reading starts one byte left of src.
     * dst / dst_stride: destination for the eight output rows.
     * filter:           int8_t coefficients; halfword elements 0 and 1 of the
     *                   loaded vector are used.
     */
    647 static void common_hz_4t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
    648                                  uint8_t *RESTRICT dst, int32_t dst_stride,
    649                                  const int8_t *filter) {
    650   v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    651   v16u8 out;
    652   v8i16 filt, out0, out1, out2, out3;
    653 
          /* Shuffle mask: the first "4 width cases" row of vp8_mc_filt_mask_arr. */
    654   mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
          /* Back up one column so the leftmost tap has a pixel to read. */
    655   src -= 1;
    656 
    657   filt = LD_SH(filter);
    658   SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
    659 
          /* Same shuffle pattern with every index advanced by two bytes. */
    660   mask1 = mask0 + 2;
    661 
          /* Rows 0..3. */
    662   LD_SB4(src, src_stride, src0, src1, src2, src3);
    663   src += (4 * src_stride);
    664 
    665   XORI_B4_128_SB(src0, src1, src2, src3);
    666   HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
    667                              out0, out1);
          /* Rows 4..7; the src0..src3 registers are reused. */
    668   LD_SB4(src, src_stride, src0, src1, src2, src3);
    669   XORI_B4_128_SB(src0, src1, src2, src3);
    670   HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
    671                              out2, out3);
    672   SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    673   SAT_SH4_SH(out0, out1, out2, out3, 7);
          /* First four output rows. */
    674   out = PCKEV_XORI128_UB(out0, out1);
    675   ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    676   dst += (4 * dst_stride);
          /* Last four output rows. */
    677   out = PCKEV_XORI128_UB(out2, out3);
    678   ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    679 }
    680 
    /*
     * Height dispatcher for the 4-wide horizontal 4-tap filter.
     *
     * height == 4 is handled by common_hz_4t_4x4_msa and height == 8 by
     * common_hz_4t_4x8_msa. For any other height the function returns without
     * writing anything to dst.
     */
    681 static void common_hz_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    682                                 uint8_t *RESTRICT dst, int32_t dst_stride,
    683                                 const int8_t *filter, int32_t height) {
    684   if (4 == height) {
    685     common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    686   } else if (8 == height) {
    687     common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    688   }
    689 }
    690 
    /*
     * Horizontal-only 4-tap filter for an 8-wide block.
     *
     * Processes four rows per loop iteration and runs (height >> 2) iterations,
     * so any remainder rows when height is not a multiple of 4 are not produced.
     *
     * src / src_stride: source pixels; reading starts one byte left of src.
     * dst / dst_stride: destination; four rows are written per iteration.
     * filter:           int8_t coefficients; halfword elements 0 and 1 of the
     *                   loaded vector are used.
     * height:           number of rows to produce.
     */
    691 static void common_hz_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    692                                 uint8_t *RESTRICT dst, int32_t dst_stride,
    693                                 const int8_t *filter, int32_t height) {
    694   uint32_t loop_cnt;
    695   v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    696   v16u8 tmp0, tmp1;
    697   v8i16 filt, out0, out1, out2, out3;
    698 
          /* Shuffle mask: the "8 width cases" row of vp8_mc_filt_mask_arr. */
    699   mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
          /* Back up one column so the leftmost tap has a pixel to read. */
    700   src -= 1;
    701 
    702   filt = LD_SH(filter);
    703   SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
    704 
          /* Same shuffle pattern with every index advanced by two bytes. */
    705   mask1 = mask0 + 2;
    706 
    707   for (loop_cnt = (height >> 2); loop_cnt--;) {
    708     LD_SB4(src, src_stride, src0, src1, src2, src3);
    709     src += (4 * src_stride);
    710 
    711     XORI_B4_128_SB(src0, src1, src2, src3);
    712     HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
    713                                filt1, out0, out1, out2, out3);
            /* Round-shift by the filter precision, saturate, then pack two rows
             * per output vector. */
    714     SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    715     SAT_SH4_SH(out0, out1, out2, out3, 7);
    716     tmp0 = PCKEV_XORI128_UB(out0, out1);
    717     tmp1 = PCKEV_XORI128_UB(out2, out3);
    718     ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    719     dst += (4 * dst_stride);
    720   }
    721 }
    722 
    /*
     * Horizontal-only 4-tap filter for a 16-wide block.
     *
     * Each source row is loaded as two vectors, one at src and one at src + 8;
     * the even-numbered srcN hold the left loads and the odd-numbered srcN the
     * right loads. Four rows are produced per iteration, for (height >> 2)
     * iterations; each output row is written with one full-vector ST_UB.
     *
     * src / src_stride: source pixels; reading starts one byte left of src.
     * dst / dst_stride: destination rows.
     * filter:           int8_t coefficients; halfword elements 0 and 1 of the
     *                   loaded vector are used.
     * height:           number of rows to produce.
     */
    723 static void common_hz_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    724                                  uint8_t *RESTRICT dst, int32_t dst_stride,
    725                                  const int8_t *filter, int32_t height) {
    726   uint32_t loop_cnt;
    727   v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    728   v16i8 filt0, filt1, mask0, mask1;
    729   v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
    730   v16u8 out;
    731 
          /* Shuffle mask: the "8 width cases" row of vp8_mc_filt_mask_arr. */
    732   mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
          /* Back up one column so the leftmost tap has a pixel to read. */
    733   src -= 1;
    734 
    735   filt = LD_SH(filter);
    736   SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
    737 
          /* Same shuffle pattern with every index advanced by two bytes. */
    738   mask1 = mask0 + 2;
    739 
    740   for (loop_cnt = (height >> 2); loop_cnt--;) {
            /* Left-hand loads go to src0/2/4/6, right-hand (+8) loads to
             * src1/3/5/7, so (src0, src1) is row 0, (src2, src3) row 1, etc. */
    741     LD_SB4(src, src_stride, src0, src2, src4, src6);
    742     LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    743     src += (4 * src_stride);
    744 
    745     XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    746     HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
    747                                filt1, out0, out1, out2, out3);
    748     HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
    749                                filt1, out4, out5, out6, out7);
    750     SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    751     SRARI_H4_SH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
    752     SAT_SH4_SH(out0, out1, out2, out3, 7);
    753     SAT_SH4_SH(out4, out5, out6, out7, 7);
            /* Each (even, odd) pair of halfword results packs into one 16-byte
             * output row. */
    754     out = PCKEV_XORI128_UB(out0, out1);
    755     ST_UB(out, dst);
    756     dst += dst_stride;
    757     out = PCKEV_XORI128_UB(out2, out3);
    758     ST_UB(out, dst);
    759     dst += dst_stride;
    760     out = PCKEV_XORI128_UB(out4, out5);
    761     ST_UB(out, dst);
    762     dst += dst_stride;
    763     out = PCKEV_XORI128_UB(out6, out7);
    764     ST_UB(out, dst);
    765     dst += dst_stride;
    766   }
    767 }
    768 
    /*
     * Vertical-only 4-tap filter for a 4-wide block.
     *
     * Reading starts one row above src. Three rows are loaded up front; every
     * loop iteration then loads four more rows (three, then one) and writes
     * four output rows. The loop runs (height >> 2) times.
     *
     * Adjacent rows are byte-interleaved (ILVR_B2_SB) and two such interleaved
     * pairs are combined into one vector with __msa_ilvr_d, so each
     * FILT_4TAP_DPADD_S_H call works on two packed vectors.
     *
     * src / src_stride: source pixels.
     * dst / dst_stride: destination; four rows written per iteration.
     * filter:           int8_t coefficients; halfword elements 0 and 1 of the
     *                   loaded vector are used.
     * height:           number of rows to produce.
     */
    769 static void common_vt_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    770                                 uint8_t *RESTRICT dst, int32_t dst_stride,
    771                                 const int8_t *filter, int32_t height) {
    772   uint32_t loop_cnt;
    773   v16i8 src0, src1, src2, src3, src4, src5;
    774   v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    775   v16i8 src2110, src4332, filt0, filt1;
    776   v8i16 filt, out10, out32;
    777   v16u8 out;
    778 
          /* Back up one row so the topmost tap has a row to read. */
    779   src -= src_stride;
    780 
    781   filt = LD_SH(filter);
    782   SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
    783 
    784   LD_SB3(src, src_stride, src0, src1, src2);
    785   src += (3 * src_stride);
    786 
    787   ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    788 
          /* Pack the (row1,row0) and (row2,row1) interleaves into one vector,
           * then XOR with 128 (undone by PCKEV_XORI128_UB at store time). */
    789   src2110 = (v16i8)__msa_ilvr_d((v2i64)src21_r, (v2i64)src10_r);
    790   src2110 = (v16i8)__msa_xori_b((v16u8)src2110, 128);
    791 
    792   for (loop_cnt = (height >> 2); loop_cnt--;) {
    793     LD_SB3(src, src_stride, src3, src4, src5);
    794     src += (3 * src_stride);
    795     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    796     src4332 = (v16i8)__msa_ilvr_d((v2i64)src43_r, (v2i64)src32_r);
    797     src4332 = (v16i8)__msa_xori_b((v16u8)src4332, 128);
    798     out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
    799 
            /* The fourth new row is loaded into src2, which also makes it the
             * "previous row" for the next iteration's first interleave. */
    800     src2 = LD_SB(src);
    801     src += (src_stride);
    802     ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
            /* src2110 is overwritten with the newest pairs; it serves as the
             * second operand here and as the first operand next iteration. */
    803     src2110 = (v16i8)__msa_ilvr_d((v2i64)src65_r, (v2i64)src54_r);
    804     src2110 = (v16i8)__msa_xori_b((v16u8)src2110, 128);
    805     out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
    806     SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT);
    807     SAT_SH2_SH(out10, out32, 7);
    808     out = PCKEV_XORI128_UB(out10, out32);
    809     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    810     dst += (4 * dst_stride);
    811   }
    812 }
    813 
    /*
     * Vertical-only 4-tap filter for an 8-wide block.
     *
     * Reading starts one row above src. Three rows are loaded before the loop;
     * each of the (height >> 2) iterations loads four more rows, produces four
     * output rows, and carries the last two row-interleaves plus the last row
     * into the next iteration.
     *
     * src / src_stride: source pixels.
     * dst / dst_stride: destination; four rows written per iteration.
     * filter:           int8_t coefficients; halfword elements 0 and 1 of the
     *                   loaded vector are used.
     * height:           number of rows to produce.
     */
    814 static void common_vt_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    815                                 uint8_t *RESTRICT dst, int32_t dst_stride,
    816                                 const int8_t *filter, int32_t height) {
    817   uint32_t loop_cnt;
    818   v16i8 src0, src1, src2, src7, src8, src9, src10;
    819   v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
    820   v16u8 tmp0, tmp1;
    821   v8i16 filt, out0_r, out1_r, out2_r, out3_r;
    822 
          /* Back up one row so the topmost tap has a row to read. */
    823   src -= src_stride;
    824 
    825   filt = LD_SH(filter);
    826   SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
    827 
    828   LD_SB3(src, src_stride, src0, src1, src2);
    829   src += (3 * src_stride);
    830 
    831   XORI_B3_128_SB(src0, src1, src2);
    832   ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    833 
    834   for (loop_cnt = (height >> 2); loop_cnt--;) {
    835     LD_SB4(src, src_stride, src7, src8, src9, src10);
    836     src += (4 * src_stride);
    837 
    838     XORI_B4_128_SB(src7, src8, src9, src10);
            /* Interleave each new row with the row directly above it; src2 is
             * the last row from the preamble or the previous iteration. */
    839     ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9, src72_r,
    840                src87_r, src98_r, src109_r);
    841     out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
    842     out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
    843     out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
    844     out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
    845     SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
    846     SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    847     tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
    848     tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
    849     ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    850     dst += (4 * dst_stride);
    851 
            /* Carry the two newest interleaves and the newest row forward. */
    852     src10_r = src98_r;
    853     src21_r = src109_r;
    854     src2 = src10;
    855   }
    856 }
    857 
    /*
     * Vertical-only 4-tap filter for a 16-wide block.
     *
     * Same structure as the 8-wide vertical kernel, but every row pair is
     * interleaved twice, with ILVR_* into the *_r vectors and ILVL_* into the
     * *_l vectors, and both sets are filtered. The *_l / *_r results are then
     * recombined by PCKEV_B4_UB into four 16-byte output rows.
     * Runs (height >> 2) iterations of four rows each.
     *
     * src / src_stride: source pixels; reading starts one row above src.
     * dst / dst_stride: destination; four full-vector rows per iteration.
     * filter:           int8_t coefficients; halfword elements 0 and 1 of the
     *                   loaded vector are used.
     * height:           number of rows to produce.
     */
    858 static void common_vt_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    859                                  uint8_t *RESTRICT dst, int32_t dst_stride,
    860                                  const int8_t *filter, int32_t height) {
    861   uint32_t loop_cnt;
    862   v16i8 src0, src1, src2, src3, src4, src5, src6;
    863   v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
    864   v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
    865   v16u8 tmp0, tmp1, tmp2, tmp3;
    866   v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    867 
          /* Back up one row so the topmost tap has a row to read. */
    868   src -= src_stride;
    869 
    870   filt = LD_SH(filter);
    871   SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
    872 
    873   LD_SB3(src, src_stride, src0, src1, src2);
    874   src += (3 * src_stride);
    875 
    876   XORI_B3_128_SB(src0, src1, src2);
    877   ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    878   ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    879 
    880   for (loop_cnt = (height >> 2); loop_cnt--;) {
    881     LD_SB4(src, src_stride, src3, src4, src5, src6);
    882     src += (4 * src_stride);
    883 
    884     XORI_B4_128_SB(src3, src4, src5, src6);
            /* Interleave each new row with the row above it, for both halves. */
    885     ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,
    886                src54_r, src65_r);
    887     ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_l, src43_l,
    888                src54_l, src65_l);
    889     out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
    890     out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
    891     out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
    892     out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
    893     out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
    894     out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
    895     out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
    896     out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
    897     SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
    898     SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT);
    899     SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    900     SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            /* Merge the _l and _r halves of each row, then undo the XOR-128
             * bias applied at load time. */
    901     PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
    902                 tmp0, tmp1, tmp2, tmp3);
    903     XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
    904     ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    905     dst += (4 * dst_stride);
    906 
            /* Carry the two newest interleaves (both halves) and the newest row. */
    907     src10_r = src54_r;
    908     src21_r = src65_r;
    909     src10_l = src54_l;
    910     src21_l = src65_l;
    911     src2 = src6;
    912   }
    913 }
    914 
    /*
     * 2-D predictor for a 4-wide block: 4-tap horizontal filter followed by a
     * 4-tap vertical filter.
     *
     * Reading starts one column left of and one row above src. Each
     * HORIZ_4TAP_FILT call is handed two consecutive source rows, so each
     * hz_out vector carries the horizontal result for a pair of rows. Pairs of
     * hz_out vectors are byte-interleaved (__msa_ilvev_b) to form the vertical
     * filter's inputs. Runs (height >> 2) iterations of four output rows each.
     *
     * src / src_stride: source pixels.
     * dst / dst_stride: destination; four rows written per iteration.
     * filter_horiz:     int8_t horizontal coefficients (halfword elements 0, 1).
     * filter_vert:      int8_t vertical coefficients (halfword elements 0, 1).
     * height:           number of rows to produce.
     */
    915 static void common_hv_4ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    916                                      uint8_t *RESTRICT dst, int32_t dst_stride,
    917                                      const int8_t *filter_horiz,
    918                                      const int8_t *filter_vert,
    919                                      int32_t height) {
    920   uint32_t loop_cnt;
    921   v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
    922   v16u8 mask0, mask1, out;
    923   v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
    924   v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
    925 
          /* Shuffle mask: the first "4 width cases" row of vp8_mc_filt_mask_arr. */
    926   mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
          /* Back up one column and one row for the leading taps. */
    927   src -= (1 + 1 * src_stride);
    928 
    929   filt = LD_SH(filter_horiz);
    930   SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
    931 
    932   mask1 = mask0 + 2;
    933 
    934   LD_SB3(src, src_stride, src0, src1, src2);
    935   src += (3 * src_stride);
    936 
    937   XORI_B3_128_SB(src0, src1, src2);
          /* hz_out0 is built from rows (0,1), hz_out1 from rows (1,2). */
    938   hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
    939   hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
    940   vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    941 
          /* `filt` is reused: from here on it holds the vertical coefficients. */
    942   filt = LD_SH(filter_vert);
    943   SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
    944 
    945   for (loop_cnt = (height >> 2); loop_cnt--;) {
    946     LD_SB4(src, src_stride, src3, src4, src5, src6);
    947     src += (4 * src_stride);
    948 
    949     XORI_B2_128_SB(src3, src4);
    950     hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
            /* hz_out2 is assembled from hz_out1 and hz_out3 with an 8-byte
             * slide instead of running the horizontal filter again.
             * NOTE(review): presumably this yields the (row 2, row 3) pair -
             * confirm the __msa_sldi_b operand order. */
    951     hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8);
    952     vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
    953     tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
    954 
    955     XORI_B2_128_SB(src5, src6);
    956     hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
    957     hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
    958     vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    959     tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
    960 
            /* NOTE(review): the hz-only and vt-only kernels in this file pass
             * VP8_FILTER_SHIFT as the rounding shift; the literal 7 here is
             * presumably the same value - confirm against vp8/common/filter.h. */
    961     SRARI_H2_SH(tmp0, tmp1, 7);
    962     SAT_SH2_SH(tmp0, tmp1, 7);
    963     out = PCKEV_XORI128_UB(tmp0, tmp1);
    964     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    965     dst += (4 * dst_stride);
    966 
            /* Carry the newest horizontal result and interleave forward. */
    967     hz_out1 = hz_out5;
    968     vec0 = vec2;
    969   }
    970 }
    971 
    /*
     * 2-D predictor for an 8-wide block: 4-tap horizontal filter followed by a
     * 4-tap vertical filter.
     *
     * Reading starts one column left of and one row above src. Every source
     * row is horizontally filtered on its own (the same row is passed as both
     * vector arguments of HORIZ_4TAP_FILT). Consecutive horizontal results are
     * byte-interleaved into row-pair vectors that feed FILT_4TAP_DPADD_S_H.
     * Two interleave chains are kept: one starting at rows (0,1) in vec0 and
     * one starting at rows (1,2) in vec2. Runs (height >> 2) iterations of four
     * output rows each.
     *
     * src / src_stride: source pixels.
     * dst / dst_stride: destination; four rows written per iteration.
     * filter_horiz:     int8_t horizontal coefficients (halfword elements 0, 1).
     * filter_vert:      int8_t vertical coefficients (halfword elements 0, 1).
     * height:           number of rows to produce.
     */
    972 static void common_hv_4ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    973                                      uint8_t *RESTRICT dst, int32_t dst_stride,
    974                                      const int8_t *filter_horiz,
    975                                      const int8_t *filter_vert,
    976                                      int32_t height) {
    977   uint32_t loop_cnt;
    978   v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
    979   v16u8 mask0, mask1, out0, out1;
    980   v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
    981   v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    982   v8i16 vec0, vec1, vec2, vec3, vec4;
    983 
          /* Shuffle mask: the "8 width cases" row of vp8_mc_filt_mask_arr. */
    984   mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
          /* Back up one column and one row for the leading taps. */
    985   src -= (1 + 1 * src_stride);
    986 
    987   filt = LD_SH(filter_horiz);
    988   SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
    989 
    990   mask1 = mask0 + 2;
    991 
    992   LD_SB3(src, src_stride, src0, src1, src2);
    993   src += (3 * src_stride);
    994 
    995   XORI_B3_128_SB(src0, src1, src2);
    996   hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
    997   hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
    998   hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
          /* vec0 <- (hz_out0, hz_out1), vec2 <- (hz_out1, hz_out2). */
    999   ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
   1000 
          /* `filt` is reused: from here on it holds the vertical coefficients. */
   1001   filt = LD_SH(filter_vert);
   1002   SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
   1003 
   1004   for (loop_cnt = (height >> 2); loop_cnt--;) {
   1005     LD_SB4(src, src_stride, src3, src4, src5, src6);
   1006     src += (4 * src_stride);
   1007 
   1008     XORI_B4_128_SB(src3, src4, src5, src6);
            /* hz_out0..hz_out2 are recycled below to hold the results for the
             * 2nd..4th new rows; hz_out3 holds the 1st new row. */
   1009     hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
   1010     vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
   1011     tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
   1012 
   1013     hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
   1014     vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3);
   1015     tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
   1016 
   1017     hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
   1018     vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
   1019     tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);
   1020 
   1021     hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
            /* vec0 and vec1 are overwritten here as scratch for the fourth
             * output row; vec1's new value is what gets carried via vec2. */
   1022     ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
   1023     tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
   1024 
            /* NOTE(review): literal 7 is presumably VP8_FILTER_SHIFT, which the
             * hz-only / vt-only kernels in this file use - confirm. */
   1025     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
   1026     SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
   1027     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
   1028     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
   1029     ST8x4_UB(out0, out1, dst, dst_stride);
   1030     dst += (4 * dst_stride);
   1031 
            /* Carry the two newest row-pair interleaves into the next pass;
             * hz_out2 (last new row) also stays live. */
   1032     vec0 = vec4;
   1033     vec2 = vec1;
   1034   }
   1035 }
   1036 
   /*
    * 16-wide variant of the 4-tap-horizontal / 4-tap-vertical predictor.
    * Runs common_hv_4ht_4vt_8w_msa twice, stepping src and dst 8 bytes to the
    * right between the calls, so the two passes cover adjacent 8-column halves.
    * Strides, both filter pointers and height are forwarded unchanged.
    */
   1037 static void common_hv_4ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
   1038                                       uint8_t *RESTRICT dst, int32_t dst_stride,
   1039                                       const int8_t *filter_horiz,
   1040                                       const int8_t *filter_vert,
   1041                                       int32_t height) {
   1042   int32_t multiple8_cnt;
          /* The post-decrement in the condition yields exactly two iterations
           * (counter goes 2 -> 1 -> 0). */
   1043   for (multiple8_cnt = 2; multiple8_cnt--;) {
   1044     common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
   1045                              filter_vert, height);
            /* Move both pointers to the right-hand 8-column half. */
   1046     src += 8;
   1047     dst += 8;
   1048   }
   1049 }
   1050 
   /*
    * 2-D predictor for a 4-wide block: 6-tap horizontal filter followed by a
    * 4-tap vertical filter.
    *
    * Reading starts two columns left of and one row above src. The horizontal
    * stage uses three coefficient splats and three shuffle masks (mask0,
    * mask0 + 2, mask0 + 4); each HORIZ_6TAP_FILT call is handed two consecutive
    * rows. The vertical stage has the same structure as the 4-tap/4-tap 4-wide
    * kernel. Runs (height >> 2) iterations of four output rows each.
    *
    * src / src_stride: source pixels.
    * dst / dst_stride: destination; four rows written per iteration.
    * filter_horiz:     int8_t horizontal coefficients (halfword elements 0..2).
    * filter_vert:      int8_t vertical coefficients (halfword elements 0, 1).
    * height:           number of rows to produce.
    */
   1051 static void common_hv_6ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
   1052                                      uint8_t *RESTRICT dst, int32_t dst_stride,
   1053                                      const int8_t *filter_horiz,
   1054                                      const int8_t *filter_vert,
   1055                                      int32_t height) {
   1056   uint32_t loop_cnt;
   1057   v16i8 src0, src1, src2, src3, src4, src5, src6;
   1058   v16i8 filt_hz0, filt_hz1, filt_hz2;
   1059   v16u8 res0, res1, mask0, mask1, mask2;
   1060   v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
   1061   v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
   1062 
          /* Shuffle mask: the first "4 width cases" row of vp8_mc_filt_mask_arr. */
   1063   mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
          /* Back up two columns (6-tap horizontal) and one row (4-tap vertical). */
   1064   src -= (2 + 1 * src_stride);
   1065 
   1066   filt = LD_SH(filter_horiz);
   1067   SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
   1068 
   1069   mask1 = mask0 + 2;
   1070   mask2 = mask0 + 4;
   1071 
   1072   LD_SB3(src, src_stride, src0, src1, src2);
   1073   src += (3 * src_stride);
   1074 
   1075   XORI_B3_128_SB(src0, src1, src2);
          /* hz_out0 is built from rows (0,1), hz_out1 from rows (1,2). */
   1076   hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
   1077                             filt_hz2);
   1078   hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
   1079                             filt_hz2);
   1080   vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
   1081 
          /* `filt` is reused: from here on it holds the vertical coefficients. */
   1082   filt = LD_SH(filter_vert);
   1083   SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
   1084 
   1085   for (loop_cnt = (height >> 2); loop_cnt--;) {
   1086     LD_SB4(src, src_stride, src3, src4, src5, src6);
   1087     src += (4 * src_stride);
   1088 
   1089     XORI_B4_128_SB(src3, src4, src5, src6);
   1090     hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
   1091                               filt_hz1, filt_hz2);
            /* The in-between row pair is assembled from the neighbouring
             * results with an 8-byte slide rather than filtered again. */
   1092     hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8);
   1093     vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
   1094     tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
   1095 
   1096     hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
   1097                               filt_hz1, filt_hz2);
   1098     hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
   1099     vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
   1100     tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
   1101 
            /* NOTE(review): literal 7 is presumably VP8_FILTER_SHIFT, which the
             * hz-only / vt-only kernels in this file use - confirm. */
   1102     SRARI_H2_SH(tmp0, tmp1, 7);
   1103     SAT_SH2_SH(tmp0, tmp1, 7);
            /* NOTE(review): unlike the other 4-wide kernels in this file, which
             * use PCKEV_XORI128_UB + ST4x4_UB(out, out, 0, 1, 2, 3, ...), this
             * one packs into two vectors and stores words 0,1 of each.
             * Presumably equivalent - confirm. */
   1104     PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
   1105     XORI_B2_128_UB(res0, res1);
   1106     ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
   1107     dst += (4 * dst_stride);
   1108 
            /* Carry the newest horizontal result and interleave forward. */
   1109     hz_out1 = hz_out5;
   1110     vec0 = vec2;
   1111   }
   1112 }
   1113 
   /*
    * 2-D predictor for an 8-wide block: 6-tap horizontal filter followed by a
    * 4-tap vertical filter.
    *
    * Reading starts two columns left of and one row above src. Every source
    * row is horizontally filtered on its own (the same row is passed as both
    * vector arguments of HORIZ_6TAP_FILT). Consecutive horizontal results are
    * byte-interleaved into row-pair vectors that feed FILT_4TAP_DPADD_S_H.
    *
    * There are no explicit carry assignments at the bottom of the loop: vec0
    * and vec2 are overwritten inside the loop body with the values the next
    * iteration needs, and hz_out2 keeps the last row's horizontal result.
    * Runs (height >> 2) iterations of four output rows each.
    *
    * src / src_stride: source pixels.
    * dst / dst_stride: destination; four rows written per iteration.
    * filter_horiz:     int8_t horizontal coefficients (halfword elements 0..2).
    * filter_vert:      int8_t vertical coefficients (halfword elements 0, 1).
    * height:           number of rows to produce.
    */
   1114 static void common_hv_6ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
   1115                                      uint8_t *RESTRICT dst, int32_t dst_stride,
   1116                                      const int8_t *filter_horiz,
   1117                                      const int8_t *filter_vert,
   1118                                      int32_t height) {
   1119   uint32_t loop_cnt;
   1120   v16i8 src0, src1, src2, src3, src4, src5, src6;
   1121   v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
   1122   v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
   1123   v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
   1124   v16u8 out0, out1;
   1125 
          /* Shuffle mask: the "8 width cases" row of vp8_mc_filt_mask_arr. */
   1126   mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
          /* Back up two columns (6-tap horizontal) and one row (4-tap vertical). */
   1127   src -= (2 + src_stride);
   1128 
   1129   filt = LD_SH(filter_horiz);
   1130   SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
   1131 
   1132   mask1 = mask0 + 2;
   1133   mask2 = mask0 + 4;
   1134 
   1135   LD_SB3(src, src_stride, src0, src1, src2);
   1136   src += (3 * src_stride);
   1137 
   1138   XORI_B3_128_SB(src0, src1, src2);
   1139   hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
   1140                             filt_hz2);
   1141   hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
   1142                             filt_hz2);
   1143   hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
   1144                             filt_hz2);
          /* vec0 <- (hz_out0, hz_out1), vec2 <- (hz_out1, hz_out2). */
   1145   ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
   1146 
          /* `filt` is reused: from here on it holds the vertical coefficients. */
   1147   filt = LD_SH(filter_vert);
   1148   SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
   1149 
   1150   for (loop_cnt = (height >> 2); loop_cnt--;) {
   1151     LD_SB4(src, src_stride, src3, src4, src5, src6);
   1152     src += (4 * src_stride);
   1153 
   1154     XORI_B4_128_SB(src3, src4, src5, src6);
   1155 
   1156     hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
   1157                               filt_hz1, filt_hz2);
   1158     vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
   1159     tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
   1160 
   1161     hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
   1162                               filt_hz1, filt_hz2);
   1163     vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3);
   1164     tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
   1165 
   1166     hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
   1167                               filt_hz1, filt_hz2);
            /* vec0 is overwritten with the (2nd, 3rd new row) pair; this value
             * is also the vec0 consumed by the next iteration's tmp0. */
   1168     vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
   1169     tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);
   1170 
   1171     hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
   1172                               filt_hz1, filt_hz2);
            /* vec1 is scratch for tmp3; vec2 <- (3rd, 4th new row) pair, which
             * is the vec2 consumed by the next iteration's tmp1. */
   1173     ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
   1174     tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
   1175 
            /* NOTE(review): literal 7 is presumably VP8_FILTER_SHIFT, which the
             * hz-only / vt-only kernels in this file use - confirm. */
   1176     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
   1177     SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
   1178     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
   1179     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
   1180     ST8x4_UB(out0, out1, dst, dst_stride);
   1181     dst += (4 * dst_stride);
   1182   }
   1183 }
   1184 
   /*
    * 16-wide variant of the 6-tap-horizontal / 4-tap-vertical predictor.
    * Runs common_hv_6ht_4vt_8w_msa twice, stepping src and dst 8 bytes to the
    * right between the calls, so the two passes cover adjacent 8-column halves.
    * Strides, both filter pointers and height are forwarded unchanged.
    */
   1185 static void common_hv_6ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
   1186                                       uint8_t *RESTRICT dst, int32_t dst_stride,
   1187                                       const int8_t *filter_horiz,
   1188                                       const int8_t *filter_vert,
   1189                                       int32_t height) {
   1190   int32_t multiple8_cnt;
          /* The post-decrement in the condition yields exactly two iterations
           * (counter goes 2 -> 1 -> 0). */
   1191   for (multiple8_cnt = 2; multiple8_cnt--;) {
   1192     common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
   1193                              filter_vert, height);
            /* Move both pointers to the right-hand 8-column half. */
   1194     src += 8;
   1195     dst += 8;
   1196   }
   1197 }
   1198 
   /*
    * 2-D predictor for a 4-wide block: 4-tap horizontal filter followed by a
    * 6-tap vertical filter.
    *
    * Reading starts one column left of and two rows above src. Five rows are
    * loaded before the loop; each HORIZ_4TAP_FILT call is handed two
    * consecutive rows. The vertical stage combines three row-pair vectors per
    * output with DPADD_SH3_SH and three coefficient splats. Runs (height >> 2)
    * iterations of four output rows each.
    *
    * src / src_stride: source pixels.
    * dst / dst_stride: destination; four rows written per iteration.
    * filter_horiz:     int8_t horizontal coefficients (halfword elements 0, 1).
    * filter_vert:      int8_t vertical coefficients (halfword elements 0..2).
    * height:           number of rows to produce.
    */
   1199 static void common_hv_4ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
   1200                                      uint8_t *RESTRICT dst, int32_t dst_stride,
   1201                                      const int8_t *filter_horiz,
   1202                                      const int8_t *filter_vert,
   1203                                      int32_t height) {
   1204   uint32_t loop_cnt;
   1205   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
   1206   v16i8 filt_hz0, filt_hz1, mask0, mask1;
   1207   v16u8 out;
   1208   v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
   1209   v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
   1210   v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
   1211 
          /* Shuffle mask: the first "4 width cases" row of vp8_mc_filt_mask_arr. */
   1212   mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
   1213 
          /* Back up one column (4-tap horizontal) and two rows (6-tap vertical). */
   1214   src -= (1 + 2 * src_stride);
   1215 
   1216   filt = LD_SH(filter_horiz);
   1217   SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
   1218 
   1219   mask1 = mask0 + 2;
   1220 
   1221   LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
   1222   src += (5 * src_stride);
   1223 
   1224   XORI_B5_128_SB(src0, src1, src2, src3, src4);
          /* Horizontal results for row pairs (0,1), (2,3) and (3,4); the value
           * for hz_out1 is assembled from hz_out0 and hz_out2 by an 8-byte slide. */
   1225   hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
   1226   hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
   1227   hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
   1228   hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
   1229   ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
   1230 
          /* `filt` is reused: from here on it holds the vertical coefficients. */
   1231   filt = LD_SH(filter_vert);
   1232   SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
   1233 
   1234   for (loop_cnt = (height >> 2); loop_cnt--;) {
   1235     LD_SB4(src, src_stride, src5, src6, src7, src8);
   1236     XORI_B4_128_SB(src5, src6, src7, src8);
   1237     src += (4 * src_stride);
   1238 
   1239     hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
   1240     hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
   1241     out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
   1242     tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
   1243 
   1244     hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
   1245     hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
   1246     out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
   1247     tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
   1248 
            /* NOTE(review): literal 7 is presumably VP8_FILTER_SHIFT, which the
             * hz-only / vt-only kernels in this file use - confirm. */
   1249     SRARI_H2_SH(tmp0, tmp1, 7);
   1250     SAT_SH2_SH(tmp0, tmp1, 7);
   1251     out = PCKEV_XORI128_UB(tmp0, tmp1);
   1252     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
   1253     dst += (4 * dst_stride);
   1254 
            /* Slide the vertical window down by four rows. */
   1255     hz_out3 = hz_out7;
   1256     out0 = out2;
   1257     out1 = out3;
   1258   }
   1259 }
   1260 
   /*
    * 2-D predictor for an 8-wide block: 4-tap horizontal filter followed by a
    * 6-tap vertical filter.
    *
    * Reading starts one column left of and two rows above src. Five rows are
    * loaded and horizontally filtered one at a time before the loop (the same
    * row is passed as both vector arguments of HORIZ_4TAP_FILT). Two chains of
    * row-pair interleaves are maintained: out0/out1/out2/out6 start at rows
    * (0,1), and out3/out4/out5/out7 start at rows (1,2). Each output row is a
    * DPADD_SH3_SH over three consecutive pairs from one chain. Runs
    * (height >> 2) iterations of four output rows each.
    *
    * src / src_stride: source pixels.
    * dst / dst_stride: destination; four rows written per iteration.
    * filter_horiz:     int8_t horizontal coefficients (halfword elements 0, 1).
    * filter_vert:      int8_t vertical coefficients (halfword elements 0..2).
    * height:           number of rows to produce.
    */
   1261 static void common_hv_4ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
   1262                                      uint8_t *RESTRICT dst, int32_t dst_stride,
   1263                                      const int8_t *filter_horiz,
   1264                                      const int8_t *filter_vert,
   1265                                      int32_t height) {
   1266   uint32_t loop_cnt;
   1267   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
   1268   v16i8 filt_hz0, filt_hz1, mask0, mask1;
   1269   v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
   1270   v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
   1271   v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
   1272   v16u8 vec0, vec1;
   1273 
          /* Shuffle mask: the "8 width cases" row of vp8_mc_filt_mask_arr. */
   1274   mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
          /* Back up one column (4-tap horizontal) and two rows (6-tap vertical). */
   1275   src -= (1 + 2 * src_stride);
   1276 
   1277   filt = LD_SH(filter_horiz);
   1278   SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
   1279 
   1280   mask1 = mask0 + 2;
   1281 
   1282   LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
   1283   src += (5 * src_stride);
   1284 
   1285   XORI_B5_128_SB(src0, src1, src2, src3, src4);
   1286   hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
   1287   hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
   1288   hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
   1289   hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
   1290   hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
          /* Chain A: out0 <- rows (0,1), out1 <- rows (2,3).
           * Chain B: out3 <- rows (1,2), out4 <- rows (3,4). */
   1291   ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
   1292   ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
   1293 
          /* `filt` is reused: from here on it holds the vertical coefficients. */
   1294   filt = LD_SH(filter_vert);
   1295   SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
   1296 
   1297   for (loop_cnt = (height >> 2); loop_cnt--;) {
   1298     LD_SB4(src, src_stride, src5, src6, src7, src8);
   1299     src += (4 * src_stride);
   1300 
   1301     XORI_B4_128_SB(src5, src6, src7, src8);
   1302 
            /* Output rows alternate between chain A (tmp0, tmp2) and chain B
             * (tmp1, tmp3). */
   1303     hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
   1304     out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
   1305     tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
   1306 
   1307     hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
   1308     out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
   1309     tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
   1310 
   1311     hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
   1312     out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
   1313     tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
   1314 
   1315     hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
   1316     out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
   1317     tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
   1318 
            /* NOTE(review): literal 7 is presumably VP8_FILTER_SHIFT, which the
             * hz-only / vt-only kernels in this file use - confirm. */
   1319     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
   1320     SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
   1321     vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
   1322     vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
   1323     ST8x4_UB(vec0, vec1, dst, dst_stride);
   1324     dst += (4 * dst_stride);
   1325 
            /* Slide both chains down by four rows: keep the last row's
             * horizontal result and the two newest pairs of each chain. */
   1326     hz_out4 = hz_out8;
   1327     out0 = out2;
   1328     out1 = out6;
   1329     out3 = out5;
   1330     out4 = out7;
   1331   }
   1332 }
   1333 
   1334 static void common_hv_4ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
   1335                                       uint8_t *RESTRICT dst, int32_t dst_stride,
   1336                                       const int8_t *filter_horiz,
   1337                                       const int8_t *filter_vert,
   1338                                       int32_t height) {
   1339   int32_t multiple8_cnt;
   1340   for (multiple8_cnt = 2; multiple8_cnt--;) {
   1341     common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
   1342                              filter_vert, height);
   1343     src += 8;
   1344     dst += 8;
   1345   }
   1346 }
   1347 
   1348 void vp8_sixtap_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
   1349                                int32_t xoffset, int32_t yoffset,
   1350                                uint8_t *RESTRICT dst, int32_t dst_stride) {
   1351   const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];
   1352   const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1];
   1353 
   1354   if (yoffset) {
   1355     if (xoffset) {
   1356       switch (xoffset) {
   1357         case 2:
   1358         case 4:
   1359         case 6:
   1360           switch (yoffset) {
   1361             case 2:
   1362             case 4:
   1363             case 6:
   1364               common_hv_6ht_6vt_4w_msa(src, src_stride, dst, dst_stride,
   1365                                        h_filter, v_filter, 4);
   1366               break;
   1367 
   1368             case 1:
   1369             case 3:
   1370             case 5:
   1371             case 7:
   1372               common_hv_6ht_4vt_4w_msa(src, src_stride, dst, dst_stride,
   1373                                        h_filter, v_filter + 1, 4);
   1374               break;
   1375           }
   1376           break;
   1377 
   1378         case 1:
   1379         case 3:
   1380         case 5:
   1381         case 7:
   1382           switch (yoffset) {
   1383             case 2:
   1384             case 4:
   1385             case 6:
   1386               common_hv_4ht_6vt_4w_msa(src, src_stride, dst, dst_stride,
   1387                                        h_filter + 1, v_filter, 4);
   1388               break;
   1389 
   1390             case 1:
   1391             case 3:
   1392             case 5:
   1393             case 7:
   1394               common_hv_4ht_4vt_4w_msa(src, src_stride, dst, dst_stride,
   1395                                        h_filter + 1, v_filter + 1, 4);
   1396               break;
   1397           }
   1398           break;
   1399       }
   1400     } else {
   1401       switch (yoffset) {
   1402         case 2:
   1403         case 4:
   1404         case 6:
   1405           common_vt_6t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
   1406           break;
   1407 
   1408         case 1:
   1409         case 3:
   1410         case 5:
   1411         case 7:
   1412           common_vt_4t_4w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
   1413                               4);
   1414           break;
   1415       }
   1416     }
   1417   } else {
   1418     switch (xoffset) {
   1419       case 0: {
   1420         uint32_t tp0, tp1, tp2, tp3;
   1421 
   1422         LW4(src, src_stride, tp0, tp1, tp2, tp3);
   1423         SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
   1424         break;
   1425       }
   1426       case 2:
   1427       case 4:
   1428       case 6:
   1429         common_hz_6t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
   1430         break;
   1431 
   1432       case 1:
   1433       case 3:
   1434       case 5:
   1435       case 7:
   1436         common_hz_4t_4w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4);
   1437         break;
   1438     }
   1439   }
   1440 }
   1441 
   1442 void vp8_sixtap_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
   1443                                int32_t xoffset, int32_t yoffset,
   1444                                uint8_t *RESTRICT dst, int32_t dst_stride) {
   1445   const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];
   1446   const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1];
   1447 
   1448   if (yoffset) {
   1449     if (xoffset) {
   1450       switch (xoffset) {
   1451         case 2:
   1452         case 4:
   1453         case 6:
   1454           switch (yoffset) {
   1455             case 2:
   1456             case 4:
   1457             case 6:
   1458               common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
   1459                                        h_filter, v_filter, 4);
   1460               break;
   1461 
   1462             case 1:
   1463             case 3:
   1464             case 5:
   1465             case 7:
   1466               common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
   1467                                        h_filter, v_filter + 1, 4);
   1468               break;
   1469           }
   1470           break;
   1471 
   1472         case 1:
   1473         case 3:
   1474         case 5:
   1475         case 7:
   1476           switch (yoffset) {
   1477             case 2:
   1478             case 4:
   1479             case 6:
   1480               common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
   1481                                        h_filter + 1, v_filter, 4);
   1482               break;
   1483 
   1484             case 1:
   1485             case 3:
   1486             case 5:
   1487             case 7:
   1488               common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
   1489                                        h_filter + 1, v_filter + 1, 4);
   1490               break;
   1491           }
   1492           break;
   1493       }
   1494     } else {
   1495       switch (yoffset) {
   1496         case 2:
   1497         case 4:
   1498         case 6:
   1499           common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
   1500           break;
   1501 
   1502         case 1:
   1503         case 3:
   1504         case 5:
   1505         case 7:
   1506           common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
   1507                               4);
   1508           break;
   1509       }
   1510     }
   1511   } else {
   1512     switch (xoffset) {
   1513       case 0: vp8_copy_mem8x4(src, src_stride, dst, dst_stride); break;
   1514       case 2:
   1515       case 4:
   1516       case 6:
   1517         common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
   1518         break;
   1519 
   1520       case 1:
   1521       case 3:
   1522       case 5:
   1523       case 7:
   1524         common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4);
   1525         break;
   1526     }
   1527   }
   1528 }
   1529 
   1530 void vp8_sixtap_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
   1531                                int32_t xoffset, int32_t yoffset,
   1532                                uint8_t *RESTRICT dst, int32_t dst_stride) {
   1533   const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];
   1534   const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1];
   1535 
   1536   if (yoffset) {
   1537     if (xoffset) {
   1538       switch (xoffset) {
   1539         case 2:
   1540         case 4:
   1541         case 6:
   1542           switch (yoffset) {
   1543             case 2:
   1544             case 4:
   1545             case 6:
   1546               common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
   1547                                        h_filter, v_filter, 8);
   1548               break;
   1549 
   1550             case 1:
   1551             case 3:
   1552             case 5:
   1553             case 7:
   1554               common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
   1555                                        h_filter, v_filter + 1, 8);
   1556               break;
   1557           }
   1558           break;
   1559 
   1560         case 1:
   1561         case 3:
   1562         case 5:
   1563         case 7:
   1564           switch (yoffset) {
   1565             case 2:
   1566             case 4:
   1567             case 6:
   1568               common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
   1569                                        h_filter + 1, v_filter, 8);
   1570               break;
   1571 
   1572             case 1:
   1573             case 3:
   1574             case 5:
   1575             case 7:
   1576               common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
   1577                                        h_filter + 1, v_filter + 1, 8);
   1578               break;
   1579           }
   1580           break;
   1581       }
   1582     } else {
   1583       switch (yoffset) {
   1584         case 2:
   1585         case 4:
   1586         case 6:
   1587           common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
   1588           break;
   1589 
   1590         case 1:
   1591         case 3:
   1592         case 5:
   1593         case 7:
   1594           common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
   1595                               8);
   1596           break;
   1597       }
   1598     }
   1599   } else {
   1600     switch (xoffset) {
   1601       case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break;
   1602       case 2:
   1603       case 4:
   1604       case 6:
   1605         common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
   1606         break;
   1607 
   1608       case 1:
   1609       case 3:
   1610       case 5:
   1611       case 7:
   1612         common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 8);
   1613         break;
   1614     }
   1615   }
   1616 }
   1617 
   1618 void vp8_sixtap_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
   1619                                  int32_t xoffset, int32_t yoffset,
   1620                                  uint8_t *RESTRICT dst, int32_t dst_stride) {
   1621   const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];
   1622   const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1];
   1623 
   1624   if (yoffset) {
   1625     if (xoffset) {
   1626       switch (xoffset) {
   1627         case 2:
   1628         case 4:
   1629         case 6:
   1630           switch (yoffset) {
   1631             case 2:
   1632             case 4:
   1633             case 6:
   1634               common_hv_6ht_6vt_16w_msa(src, src_stride, dst, dst_stride,
   1635                                         h_filter, v_filter, 16);
   1636               break;
   1637 
   1638             case 1:
   1639             case 3:
   1640             case 5:
   1641             case 7:
   1642               common_hv_6ht_4vt_16w_msa(src, src_stride, dst, dst_stride,
   1643                                         h_filter, v_filter + 1, 16);
   1644               break;
   1645           }
   1646           break;
   1647 
   1648         case 1:
   1649         case 3:
   1650         case 5:
   1651         case 7:
   1652           switch (yoffset) {
   1653             case 2:
   1654             case 4:
   1655             case 6:
   1656               common_hv_4ht_6vt_16w_msa(src, src_stride, dst, dst_stride,
   1657                                         h_filter + 1, v_filter, 16);
   1658               break;
   1659 
   1660             case 1:
   1661             case 3:
   1662             case 5:
   1663             case 7:
   1664               common_hv_4ht_4vt_16w_msa(src, src_stride, dst, dst_stride,
   1665                                         h_filter + 1, v_filter + 1, 16);
   1666               break;
   1667           }
   1668           break;
   1669       }
   1670     } else {
   1671       switch (yoffset) {
   1672         case 2:
   1673         case 4:
   1674         case 6:
   1675           common_vt_6t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16);
   1676           break;
   1677 
   1678         case 1:
   1679         case 3:
   1680         case 5:
   1681         case 7:
   1682           common_vt_4t_16w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
   1683                                16);
   1684           break;
   1685       }
   1686     }
   1687   } else {
   1688     switch (xoffset) {
   1689       case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break;
   1690       case 2:
   1691       case 4:
   1692       case 6:
   1693         common_hz_6t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16);
   1694         break;
   1695 
   1696       case 1:
   1697       case 3:
   1698       case 5:
   1699       case 7:
   1700         common_hz_4t_16w_msa(src, src_stride, dst, dst_stride, h_filter + 1,
   1701                              16);
   1702         break;
   1703     }
   1704   }
   1705 }
   1706