      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "./vp8_rtcd.h"
     12 #include "vpx_ports/mem.h"
     13 #include "vp8/common/filter.h"
     14 #include "vp8/common/mips/msa/vp8_macros_msa.h"
     15 
/* 2-tap bilinear coefficient pairs for the seven fractional sub-pixel
 * positions 1..7; each pair sums to 128, so position 0 (full pel) needs no
 * filtering and is not stored. Aligned for vector loads. */
DECLARE_ALIGNED(16, static const int8_t, vp8_bilinear_filters_msa[7][2]) = {
  { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 },
  { 48, 80 },  { 32, 96 }, { 16, 112 }
};
     20 
/* Byte-shuffle masks for VSHF_B: gather the adjacent (pixel, pixel+1) byte
 * pairs that feed the horizontal 2-tap filter. Indices >= 16 select from the
 * second source vector of the shuffle. */
static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases (two rows packed per vector) */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases, starting at byte offset 8 */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
     29 
/* Horizontal 2-tap (bilinear) filter, 4x4 block.
 * Gathers adjacent-pixel byte pairs with the 4-wide shuffle mask, takes the
 * unsigned dot product against the splatted tap pair, rounds and shifts by
 * VP8_FILTER_SHIFT, then stores four 4-byte rows. */
static void common_hz_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  /* shuffle mask for the 4-wide case (two rows per vector) */
  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  /* broadcast the 2-tap coefficient pair across all halfword lanes */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, VP8_FILTER_SHIFT);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
     49 
/* Horizontal 2-tap (bilinear) filter, 4x8 block.
 * Same scheme as the 4x4 kernel but processes eight rows: two rows are packed
 * per shuffle result, filtered, rounded by VP8_FILTER_SHIFT and stored as two
 * groups of four 4-byte rows. */
static void common_hz_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16i8 res0, res1, res2, res3;
  v8u16 vec4, vec5, vec6, vec7, filt;

  /* shuffle mask for the 4-wide case */
  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  /* broadcast the 2-tap coefficient pair */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}
     75 
     76 static void common_hz_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
     77                                 uint8_t *RESTRICT dst, int32_t dst_stride,
     78                                 const int8_t *filter, int32_t height) {
     79   if (4 == height) {
     80     common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
     81   } else if (8 == height) {
     82     common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
     83   }
     84 }
     85 
/* Horizontal 2-tap (bilinear) filter, 8x4 block.
 * Uses the 8-wide shuffle mask (one row per vector), filters four rows,
 * rounds by VP8_FILTER_SHIFT and stores four 8-byte rows. */
static void common_hz_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask;
  v8u16 vec0, vec1, vec2, vec3, filt;

  /* shuffle mask for the 8-wide case */
  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* broadcast the 2-tap coefficient pair */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
  ST8x4_UB(src0, src1, dst, dst_stride);
}
    107 
/* Horizontal 2-tap (bilinear) filter, 8-wide, heights 8 or 16.
 * Processes four rows at a time; loads for the next group are interleaved
 * with the pack/store of the current group to hide load latency, so the
 * statement order here is deliberate. */
static void common_hz_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask, out0, out1;
  v8u16 vec0, vec1, vec2, vec3, filt;

  /* shuffle mask for the 8-wide case */
  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* broadcast the 2-tap coefficient pair */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* rows 0..3 */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);

  /* prefetch rows 4..7 before storing rows 0..3 */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  /* rows 4..7 */
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    /* rows 8..15, same interleaved pattern as above */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
  }
}
    169 
    170 static void common_hz_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    171                                 uint8_t *RESTRICT dst, int32_t dst_stride,
    172                                 const int8_t *filter, int32_t height) {
    173   if (4 == height) {
    174     common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
    175   } else {
    176     common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
    177   }
    178 }
    179 
/* Horizontal 2-tap (bilinear) filter, 16-wide, height a multiple of 4.
 * Each 16-pixel row is handled as two overlapping 8-pixel halves (the second
 * half loaded from src + 8). The first group of four rows is peeled before
 * the loop, hence loop_cnt = (height >> 2) - 1. */
static void common_hz_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  /* shuffle mask for the 8-wide case */
  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* minus one: the first 4-row group is processed before the loop */
  loop_cnt = (height >> 2) - 1;

  /* broadcast the 2-tap coefficient pair */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* even regs = left 8 pixels, odd regs = right 8 pixels of each row */
  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
              out2, out3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
              out6, out7);
  SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
  SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
  PCKEV_ST_SB(out0, out1, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out2, out3, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out4, out5, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out6, out7, dst);
  dst += dst_stride;

  for (; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
  }
}
    243 
/* Vertical 2-tap (bilinear) filter, 4x4 block.
 * Needs 5 source rows for 4 output rows. Consecutive rows are interleaved so
 * the per-pixel (row, row+1) byte pairs feed the unsigned dot product, which
 * is then rounded by VP8_FILTER_SHIFT. */
static void common_vt_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4;
  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
  v16u8 filt0;
  v8i16 filt;
  v8u16 tmp0, tmp1;

  /* broadcast the 2-tap coefficient pair */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  /* interleave adjacent rows: srcNM_r holds (rowM, rowN) byte pairs */
  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
  src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
}
    267 
/* Vertical 2-tap (bilinear) filter, 4x8 block.
 * Needs 9 source rows for 8 output rows; same interleave-and-dot-product
 * scheme as the 4x4 kernel, two 4-row groups. */
static void common_vt_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 filt;

  /* broadcast the 2-tap coefficient pair */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);

  /* ninth row completes the last (row7, row8) pair */
  src8 = LD_SB(src);
  src += src_stride;

  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
             src76_r, src87_r);
  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
             src76_r, src2110, src4332, src6554, src8776);
  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
              tmp0, tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
    300 
    301 static void common_vt_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    302                                 uint8_t *RESTRICT dst, int32_t dst_stride,
    303                                 const int8_t *filter, int32_t height) {
    304   if (4 == height) {
    305     common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    306   } else if (8 == height) {
    307     common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    308   }
    309 }
    310 
/* Vertical 2-tap (bilinear) filter, 8x4 block.
 * Loads 5 rows, interleaves adjacent rows into byte pairs, dot-products with
 * the splatted tap pair and rounds by VP8_FILTER_SHIFT. */
static void common_vt_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
  v16i8 out0, out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* broadcast the 2-tap coefficient pair */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
              tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
}
    331 
/* Vertical 2-tap (bilinear) filter, 8-wide, height a multiple of 8.
 * src0 always holds the last row of the previous group so each iteration of
 * the loop consumes 8 new rows and produces 8 output rows. */
static void common_vt_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v16i8 out0, out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* broadcast the 2-tap coefficient pair */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  /* prime the pipeline with the first row */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
    src += (8 * src_stride);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
               vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* carry the last loaded row into the next group */
    src0 = src8;
  }
}
    373 
    374 static void common_vt_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    375                                 uint8_t *RESTRICT dst, int32_t dst_stride,
    376                                 const int8_t *filter, int32_t height) {
    377   if (4 == height) {
    378     common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
    379   } else {
    380     common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
    381   }
    382 }
    383 
/* Vertical 2-tap (bilinear) filter, 16-wide, height a multiple of 4.
 * Each output row uses the right (ILVR) and left (ILVL) interleave of two
 * adjacent source rows so a full 16-pixel row is covered by two dot
 * products. src0 carries the last row across loop iterations. */
static void common_vt_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* broadcast the 2-tap coefficient pair */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  /* prime the pipeline with the first row */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    /* (even vec, odd vec) = (low half, high half) of each row pair */
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    dst += dst_stride;

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp2, tmp3, dst);
    dst += dst_stride;

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    dst += dst_stride;

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp2, tmp3, dst);
    dst += dst_stride;

    /* carry the last loaded row into the next group */
    src0 = src4;
  }
}
    430 
/* Combined horizontal + vertical 2-tap (bilinear) filter, 4x4 block.
 * First pass: horizontal 2-tap filter on 5 rows into intermediate 16-bit
 * results (hz_out*). Second pass: vertical 2-tap filter across consecutive
 * horizontal outputs, rounded by VP8_FILTER_SHIFT each pass. */
static void common_hv_2ht_2vt_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;

  /* shuffle mask for the 4-wide case */
  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  /* broadcast the horizontal and vertical tap pairs */
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
  /* build the shifted-by-one-row companions for the vertical pass */
  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
    459 
/* Combined horizontal + vertical 2-tap (bilinear) filter, 4x8 block.
 * Horizontal pass over 9 rows, then vertical 2-tap across consecutive
 * horizontal outputs; both passes round by VP8_FILTER_SHIFT. */
static void common_hv_2ht_2vt_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16i8 res0, res1, res2, res3;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;

  /* shuffle mask for the 4-wide case */
  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  /* broadcast the horizontal and vertical tap pairs */
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);
  /* ninth row completes the last vertical pair */
  src8 = LD_SB(src);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, VP8_FILTER_SHIFT);
  /* build the shifted-by-one-row companions for the vertical pass */
  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
             hz_out3, hz_out5, 8);
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
              vec5, vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}
    501 
    502 static void common_hv_2ht_2vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    503                                      uint8_t *RESTRICT dst, int32_t dst_stride,
    504                                      const int8_t *filter_horiz,
    505                                      const int8_t *filter_vert,
    506                                      int32_t height) {
    507   if (4 == height) {
    508     common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
    509                               filter_vert);
    510   } else if (8 == height) {
    511     common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
    512                               filter_vert);
    513   }
    514 }
    515 
/* Combined horizontal + vertical 2-tap (bilinear) filter, 8x4 block.
 * hz_out0/hz_out1 ping-pong as the sliding pair of horizontal results feeding
 * the vertical dot product; both passes round by VP8_FILTER_SHIFT. */
static void common_hv_2ht_2vt_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* shuffle mask for the 8-wide case */
  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* broadcast the horizontal and vertical tap pairs */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp0 = __msa_dotp_u_h(vec0, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp1 = __msa_dotp_u_h(vec1, filt_vt);

  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp2 = __msa_dotp_u_h(vec2, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);

  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
}
    555 
/* Combined horizontal + vertical 2-tap (bilinear) filter, 8-wide, height a
 * multiple of 8. hz_out0/hz_out1 ping-pong as the sliding pair of horizontal
 * results feeding the vertical dot product; hz_out0 is primed before the loop
 * and carried across iterations. Statement order (loads interleaved with
 * compute) is deliberate. */
static void common_hv_2ht_2vt_8x8mult_msa(
    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
    int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
  v8i16 filt;

  /* shuffle mask for the 8-wide case */
  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* broadcast the horizontal and vertical tap pairs */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* prime the first horizontal output row */
  src0 = LD_SB(src);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    /* prefetch the next four rows while finishing this group */
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp4 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp3, tmp4, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp5 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp6 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp7 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp8 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
    629 
    630 static void common_hv_2ht_2vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
    631                                      uint8_t *RESTRICT dst, int32_t dst_stride,
    632                                      const int8_t *filter_horiz,
    633                                      const int8_t *filter_vert,
    634                                      int32_t height) {
    635   if (4 == height) {
    636     common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
    637                               filter_vert);
    638   } else {
    639     common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
    640                                   filter_horiz, filter_vert, height);
    641   }
    642 }
    643 
/* Combined horizontal + vertical 2-tap (bilinear) filter, 16-wide, height a
 * multiple of 4. Each 16-pixel row is two 8-pixel halves (left in hz_out0/1,
 * right in hz_out2/3); the horizontal outputs of the previous row are carried
 * in hz_out0/hz_out2 across iterations for the vertical pass. */
static void common_hv_2ht_2vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
  v8i16 filt;

  /* shuffle mask for the 8-wide case */
  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* prime the horizontal outputs of row 0 (left and right halves) */
  LD_SB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* even regs = left halves, odd regs = right halves of four rows */
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;
  }
}
    707 
    708 void vp8_bilinear_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
    709                                  int32_t xoffset, int32_t yoffset,
    710                                  uint8_t *RESTRICT dst, int32_t dst_stride) {
    711   const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
    712   const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
    713 
    714   if (yoffset) {
    715     if (xoffset) {
    716       common_hv_2ht_2vt_4w_msa(src, src_stride, dst, dst_stride, h_filter,
    717                                v_filter, 4);
    718     } else {
    719       common_vt_2t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
    720     }
    721   } else {
    722     if (xoffset) {
    723       common_hz_2t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
    724     } else {
    725       uint32_t tp0, tp1, tp2, tp3;
    726 
    727       LW4(src, src_stride, tp0, tp1, tp2, tp3);
    728       SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
    729     }
    730   }
    731 }
    732 
    733 void vp8_bilinear_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
    734                                  int32_t xoffset, int32_t yoffset,
    735                                  uint8_t *RESTRICT dst, int32_t dst_stride) {
    736   const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
    737   const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
    738 
    739   if (yoffset) {
    740     if (xoffset) {
    741       common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride, h_filter,
    742                                v_filter, 4);
    743     } else {
    744       common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
    745     }
    746   } else {
    747     if (xoffset) {
    748       common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
    749     } else {
    750       vp8_copy_mem8x4(src, src_stride, dst, dst_stride);
    751     }
    752   }
    753 }
    754 
    755 void vp8_bilinear_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
    756                                  int32_t xoffset, int32_t yoffset,
    757                                  uint8_t *RESTRICT dst, int32_t dst_stride) {
    758   const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
    759   const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
    760 
    761   if (yoffset) {
    762     if (xoffset) {
    763       common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride, h_filter,
    764                                v_filter, 8);
    765     } else {
    766       common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
    767     }
    768   } else {
    769     if (xoffset) {
    770       common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
    771     } else {
    772       vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
    773     }
    774   }
    775 }
    776 
    777 void vp8_bilinear_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
    778                                    int32_t xoffset, int32_t yoffset,
    779                                    uint8_t *RESTRICT dst, int32_t dst_stride) {
    780   const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
    781   const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
    782 
    783   if (yoffset) {
    784     if (xoffset) {
    785       common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, h_filter,
    786                                 v_filter, 16);
    787     } else {
    788       common_vt_2t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16);
    789     }
    790   } else {
    791     if (xoffset) {
    792       common_hz_2t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16);
    793     } else {
    794       vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
    795     }
    796   }
    797 }
    798