      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include "./vpx_dsp_rtcd.h"
     13 #include "vpx_dsp/mips/vpx_convolve_msa.h"
     14 
/* Horizontal 8-tap convolution of a 4x4 block with MSA vectors.
 * src/dst: pixel pointers with the given row strides.
 * filter:  the 8 signed 8-bit filter taps. */
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 mask0, mask1, mask2, mask3, out;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v8i16 filt, out0, out1;

  /* Shuffle mask for the 4-wide kernels lives at offset 16 of the table. */
  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3; /* step back to the first of the 8 taps' input pixels */

  /* rearranging filter: splat each 16-bit tap pair into its own vector */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  /* XOR with 128 biases unsigned pixels into signed range for the
     signed dot-product macros; PCKEV_XORI128_UB undoes it at the end. */
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  /* round-shift by FILTER_BITS, then saturate to 8-bit signed range */
  SRARI_H2_SH(out0, out1, FILTER_BITS);
  SAT_SH2_SH(out0, out1, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
     42 
/* Horizontal 8-tap convolution of a 4x8 block: two 4-row passes,
 * filtering four rows per HORIZ_8TAP_4WID_4VECS_FILT invocation. */
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src0, src1, src2, src3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3; /* step back to the first of the 8 taps' input pixels */

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* rows 0-3 */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3); /* bias pixels to signed range */
  src += (4 * src_stride);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  /* rows 4-7 */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out2, out3);
  /* round, saturate and store both 4-row halves */
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);
  out = PCKEV_XORI128_UB(out2, out3);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
     79 
     80 static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
     81                                 uint8_t *dst, int32_t dst_stride,
     82                                 int8_t *filter, int32_t height) {
     83   if (4 == height) {
     84     common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
     85   } else if (8 == height) {
     86     common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
     87   }
     88 }
     89 
/* Horizontal 8-tap convolution of an 8x4 block with MSA vectors. */
static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  /* Shuffle mask for the 8-wide kernels is at offset 0 of the table. */
  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3; /* step back to the first of the 8 taps' input pixels */

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3); /* bias pixels to signed range */
  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1, out2,
                             out3);
  /* round, saturate, pack back to unsigned bytes and store 4 rows */
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  tmp0 = PCKEV_XORI128_UB(out0, out1);
  tmp1 = PCKEV_XORI128_UB(out2, out3);
  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
}
    119 
/* Horizontal 8-tap convolution, 8 pixels wide, for any height that is a
 * multiple of 4 (processes 4 rows per loop iteration). */
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3; /* step back to the first of the 8 taps' input pixels */

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3); /* bias to signed range */
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
    154 
    155 static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
    156                                 uint8_t *dst, int32_t dst_stride,
    157                                 int8_t *filter, int32_t height) {
    158   if (4 == height) {
    159     common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
    160   } else {
    161     common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
    162   }
    163 }
    164 
/* Horizontal 8-tap convolution, 16 pixels wide; processes 2 rows per
 * iteration, loading each row as two overlapping 16-byte vectors. */
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3; /* step back to the first of the 8 taps' input pixels */

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    /* src0/src2: left halves of rows 0/1; src1/src3: the +8 offsets
       needed so every output pixel sees its full 8-tap window. */
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (2 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    dst += dst_stride;
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst);
    dst += dst_stride;
  }
}
    202 
/* Horizontal 8-tap convolution, 32 pixels wide; 2 rows per iteration.
 * The second row's loads are issued between the first row's filter and
 * store to hide load latency (software pipelining) — keep the ordering. */
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3; /* step back to the first of the 8 taps' input pixels */

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    /* first row: src1 (bytes 8..23) is synthesized from src0/src2 with a
       byte shift instead of an extra unaligned load */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);

    /* prefetch second row while first row's results are packed/stored */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;

    /* second row: filter and store */
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;
  }
}
    260 
/* Horizontal 8-tap convolution, 64 pixels wide; one row per iteration,
 * handled as two 32-pixel halves (offsets 0..31 and 32..63). */
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3; /* step back to the first of the 8 taps' input pixels */

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    /* left 32 pixels; src1 synthesized via byte shift of src0/src2 */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);

    /* right 32 pixels */
    src0 = LD_SB(src + 32);
    src2 = LD_SB(src + 48);
    src3 = LD_SB(src + 56);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst + 32);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 48);
    dst += dst_stride;
  }
}
    316 
/* Horizontal 2-tap (bilinear) convolution of a 4x4 block.
 * filter points at the two active taps (callers pass &taps[3]). */
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]); /* 4-wide shuffle mask */

  /* rearranging filter: splat the first 16-bit tap pair across lanes */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  /* gather adjacent pixel pairs, dot with the taps, round-shift, pack */
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
    337 
/* Horizontal 2-tap (bilinear) convolution of a 4x8 block: all 8 rows are
 * loaded up front and filtered in one pass. */
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16i8 res0, res1, res2, res3;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]); /* 4-wide shuffle mask */

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  /* store rows 0-3, then rows 4-7 */
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}
    364 
    365 static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
    366                                 uint8_t *dst, int32_t dst_stride,
    367                                 int8_t *filter, int32_t height) {
    368   if (4 == height) {
    369     common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    370   } else if (8 == height) {
    371     common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    372   }
    373 }
    374 
/* Horizontal 2-tap (bilinear) convolution of an 8x4 block. */
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]); /* 8-wide shuffle mask */

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  /* dot products accumulate in-place into vec0..vec3 */
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); /* src regs reused as outputs */
  ST8x4_UB(src0, src1, dst, dst_stride);
}
    397 
/* Horizontal 2-tap (bilinear) convolution, 8 pixels wide, for heights 8
 * and 16. Fully unrolled in 4-row groups; loads for the next group are
 * interleaved with stores of the current one — keep the ordering. */
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask, out0, out1;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]); /* 8-wide shuffle mask */

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* rows 0-3 */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);

  /* load rows 4-7 before storing rows 0-3 to hide load latency */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  /* rows 4-7 */
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    /* rows 8-11 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    /* load rows 12-15 before storing rows 8-11 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);

    /* rows 12-15; dst was not advanced above, hence the explicit offset */
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
  }
}
    460 
    461 static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
    462                                 uint8_t *dst, int32_t dst_stride,
    463                                 int8_t *filter, int32_t height) {
    464   if (4 == height) {
    465     common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
    466   } else {
    467     common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
    468   }
    469 }
    470 
/* Horizontal 2-tap (bilinear) convolution, 16 pixels wide; 4 rows per
 * iteration. The first 4-row group is peeled out of the loop, so the
 * loop runs (height/4) - 1 times. */
static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]); /* 8-wide shuffle mask */

  loop_cnt = (height >> 2) - 1; /* first group handled before the loop */

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* peeled first group: even regs = left halves, odd regs = +8 offsets */
  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
              out2, out3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
              out6, out7);
  SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
  SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
  PCKEV_ST_SB(out0, out1, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out2, out3, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out4, out5, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out6, out7, dst);
  dst += dst_stride;

  for (; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
  }
}
    535 
/* Horizontal 2-tap (bilinear) convolution, 32 pixels wide; 2 rows per
 * iteration, with the middle 16-byte vector of each row synthesized via
 * a byte shift instead of an unaligned load. */
static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]); /* 8-wide shuffle mask */

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height >> 1; loop_cnt--;) {
    /* row 0 */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    /* row 1 */
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    PCKEV_ST_SB(out6, out7, dst + 16);
    dst += dst_stride;
  }
}
    580 
/* Horizontal 2-tap (bilinear) convolution, 64 pixels wide; one row per
 * iteration. Odd-numbered src vectors are built from their neighbors
 * with byte shifts to avoid unaligned loads. */
static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]); /* 8-wide shuffle mask */

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src4 = LD_SB(src + 32);
    src6 = LD_SB(src + 48);
    src7 = LD_SB(src + 56);
    /* src1/src3/src5 = bytes 8..23 / 24..39 / 40..55 via shifts */
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    PCKEV_ST_SB(out4, out5, dst + 32);
    PCKEV_ST_SB(out6, out7, dst + 48);
    dst += dst_stride;
  }
}
    621 
    622 void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
    623                              uint8_t *dst, ptrdiff_t dst_stride,
    624                              const InterpKernel *filter, int x0_q4,
    625                              int x_step_q4, int y0_q4, int y_step_q4, int w,
    626                              int h) {
    627   const int16_t *const filter_x = filter[x0_q4];
    628   int8_t cnt, filt_hor[8];
    629 
    630   assert(x_step_q4 == 16);
    631   assert(((const int32_t *)filter_x)[1] != 0x800000);
    632 
    633   for (cnt = 0; cnt < 8; ++cnt) {
    634     filt_hor[cnt] = filter_x[cnt];
    635   }
    636 
    637   if (((const int32_t *)filter_x)[0] == 0) {
    638     switch (w) {
    639       case 4:
    640         common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
    641                             &filt_hor[3], h);
    642         break;
    643       case 8:
    644         common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
    645                             &filt_hor[3], h);
    646         break;
    647       case 16:
    648         common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
    649                              &filt_hor[3], h);
    650         break;
    651       case 32:
    652         common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
    653                              &filt_hor[3], h);
    654         break;
    655       case 64:
    656         common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
    657                              &filt_hor[3], h);
    658         break;
    659       default:
    660         vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
    661                               x_step_q4, y0_q4, y_step_q4, w, h);
    662         break;
    663     }
    664   } else {
    665     switch (w) {
    666       case 4:
    667         common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
    668                             filt_hor, h);
    669         break;
    670       case 8:
    671         common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
    672                             filt_hor, h);
    673         break;
    674       case 16:
    675         common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
    676                              filt_hor, h);
    677         break;
    678       case 32:
    679         common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
    680                              filt_hor, h);
    681         break;
    682       case 64:
    683         common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
    684                              filt_hor, h);
    685         break;
    686       default:
    687         vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
    688                               x_step_q4, y0_q4, y_step_q4, w, h);
    689         break;
    690     }
    691   }
    692 }
    693