/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

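/* Horizontal convolve-and-average kernels (MIPS MSA). Each helper applies a
 * horizontal filter to a block and averages the rounded result with the
 * pixels already in dst. The common_hz_8t_* paths use the full 8-tap subpel
 * filter; the common_hz_2t_* paths cover the bilinear case, where only the
 * two middle taps are nonzero. */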
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst0, dst1, dst2, dst3, res2, res3;
  v16u8 mask0, mask1, mask2, mask3;
  v8i16 filt, res0, res1;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, res0, res1);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  SRARI_H2_SH(res0, res1, FILTER_BITS);
  SAT_SH2_SH(res0, res1, 7);
  PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
  XORI_B2_128_UB(res2, res3);
  AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8i16 filt, vec0, vec1, vec2, vec3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, vec0, vec1);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, vec2, vec3);
  SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
              res3);
  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
  XORI_B2_128_UB(res0, res2);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
             dst6);
  ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
  AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
  ST4x8_UB(res0, res2, dst, dst_stride);
}

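/* Width-4 dispatcher; heights other than 4 and 8 are not expected here and
 * would be silently ignored. */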
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

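/* 8-tap horizontal filter plus averaging for 8-wide blocks, four rows per
 * loop iteration (height is assumed to be a multiple of 4). */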
static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
                            dst_stride);
    dst += (4 * dst_stride);
  }
}

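/* 16-wide blocks, two rows per iteration. Each row is loaded as two 16-byte
 * vectors at byte offsets 0 and 8, giving the shuffle masks the 7 extra
 * bytes of context each 8-pixel group needs for an 8-tap filter. */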
static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height >> 1; loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    src += (2 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
               vec14);
    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
               vec15);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                vec9, vec10, vec11);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
                 vec2, vec3);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                 vec9, vec10, vec11);
    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                out2, out3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
    dst += dst_stride;
    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
    dst += dst_stride;
  }
}

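/* 32-wide blocks, one row per iteration. The vector at byte offset 8 (src1)
 * is spliced from src0 and src2 with __msa_sldi_b rather than loaded. */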
static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
               vec14);
    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
               vec15);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                vec9, vec10, vec11);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
                 vec2, vec3);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                 vec9, vec10, vec11);
    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    LD_UB2(dst, 16, dst1, dst2);
    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
    dst += dst_stride;
  }
}

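/* 64-wide blocks: each row is processed as two 32-byte halves (cnt << 5 is
 * the byte offset of the half), reusing the 32-wide pattern. */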
static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt, cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    for (cnt = 0; cnt < 2; ++cnt) {
      src0 = LD_SB(&src[cnt << 5]);
      src2 = LD_SB(&src[16 + (cnt << 5)]);
      src3 = LD_SB(&src[24 + (cnt << 5)]);
      src1 = __msa_sldi_b(src2, src0, 8);

      XORI_B4_128_SB(src0, src1, src2, src3);
      VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
                 vec12);
      VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
                 vec13);
      VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
                 vec14);
      VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
                 vec15);
      DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                  vec1, vec2, vec3);
      DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                  vec9, vec10, vec11);
      DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
                   vec1, vec2, vec3);
      DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                   vec9, vec10, vec11);
      ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                  out2, out3);
      SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
      SAT_SH4_SH(out0, out1, out2, out3, 7);
      LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
      PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
      PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
    }

    src += src_stride;
    dst += dst_stride;
  }
}

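/* Bilinear (2-tap) kernels. With a single splatted tap pair the filtering is
 * done with unsigned dot products, so the XORI-by-128 sign conversion used in
 * the 8-tap paths above is not needed. */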
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
             dst6);
  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                     dst_stride);
}

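/* 8-wide bilinear kernel for heights 8 and 16: the height-8 body is fully
 * unrolled (two 4-row passes), and the height == 16 branch repeats it for
 * the remaining 8 rows. */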
static void common_hz_2t_and_aver_dst_8x8mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter, int32_t height) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                     dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                     dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                       dst_stride);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                       dst_stride);
  }
}

static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                          filter, height);
  }
}

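/* 16-wide bilinear kernel. The first four rows are peeled off ahead of the
 * loop, which then covers the remaining (height / 4) - 1 groups of four. */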
static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
              res2, res3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
              res6, res7);
  SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
  SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
  dst += dst_stride;

  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                res6, res7);
    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
    dst += dst_stride;
  }
}

static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                res6, res7);
    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
    LD_UB2(dst, 16, dst0, dst1);
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
    dst += dst_stride;
    LD_UB2(dst, 16, dst2, dst3);
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
    dst += dst_stride;
  }
}

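/* 64-wide bilinear kernel: four aligned 16-byte loads plus one load at byte
 * offset 56 feed SLDI_B3_SB, which splices the vectors at byte offsets 8,
 * 24 and 40 without further loads. */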
static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    LD_SB4(src, 16, src0, src2, src4, src6);
    src7 = LD_SB(src + 56);
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
    PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
    PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
    PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
    dst += dst_stride;
  }
}

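/* Entry point. The int16_t taps are type-punned as int32_t pairs: word 0
 * being zero means taps 0 and 1 are zero, which (for vpx filter banks)
 * identifies a bilinear filter, handled via the middle tap pair at
 * &filt_hor[3]. The assert on word 1 presumably rejects the identity filter
 * (on a little-endian layout, 0x800000 is tap 2 == 0 with tap 3 == 128). */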
void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }

  if (((const int32_t *)filter_x)[0] == 0) {
    switch (w) {
      case 4:
        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                  x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_hor, h);
        break;
      case 8:
        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_hor, h);
        break;
      case 16:
        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      case 32:
        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      case 64:
        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                  x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}