1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <assert.h> 12 #include "./vpx_dsp_rtcd.h" 13 #include "vpx_dsp/mips/vpx_convolve_msa.h" 14 15 static void common_hv_8ht_8vt_and_aver_dst_4w_msa( 16 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 17 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 18 uint32_t loop_cnt; 19 uint32_t tp0, tp1, tp2, tp3; 20 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 21 v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res; 22 v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; 23 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 24 v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; 25 v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; 26 27 mask0 = LD_UB(&mc_filt_mask_arr[16]); 28 src -= (3 + 3 * src_stride); 29 30 /* rearranging filter */ 31 filt = LD_SH(filter_horiz); 32 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); 33 34 mask1 = mask0 + 2; 35 mask2 = mask0 + 4; 36 mask3 = mask0 + 6; 37 38 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 39 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 40 src += (7 * src_stride); 41 42 hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, 43 filt_hz1, filt_hz2, filt_hz3); 44 hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, 45 filt_hz1, filt_hz2, filt_hz3); 46 hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, 47 filt_hz1, filt_hz2, filt_hz3); 48 hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, 49 filt_hz1, filt_hz2, filt_hz3); 50 SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); 51 52 filt = LD_SH(filter_vert); 53 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); 54 55 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 56 vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); 57 58 for (loop_cnt = (height >> 2); loop_cnt--;) { 59 LD_SB4(src, src_stride, src7, src8, src9, src10); 60 XORI_B4_128_SB(src7, src8, src9, src10); 61 src += (4 * src_stride); 62 63 LW4(dst, dst_stride, tp0, tp1, tp2, tp3); 64 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 65 hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, 66 filt_hz1, filt_hz2, filt_hz3); 67 hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); 68 vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); 69 res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1, 70 filt_vt2, filt_vt3); 71 72 hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, 73 filt_hz1, filt_hz2, filt_hz3); 74 hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); 75 vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); 76 res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, 77 filt_vt2, filt_vt3); 78 79 SRARI_H2_SH(res0, res1, FILTER_BITS); 80 SAT_SH2_SH(res0, res1, 7); 81 res = PCKEV_XORI128_UB(res0, res1); 82 res = (v16u8)__msa_aver_u_b(res, dst0); 83 ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); 84 dst += (4 * dst_stride); 85 86 hz_out5 = hz_out9; 87 vec0 = vec2; 88 vec1 = vec3; 89 vec2 = vec4; 90 } 91 } 92 93 static void common_hv_8ht_8vt_and_aver_dst_8w_msa( 94 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 95 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 96 uint32_t loop_cnt; 97 uint64_t tp0, tp1, tp2, tp3; 98 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 99 v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; 100 v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; 101 v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3; 102 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 103 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; 104 v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; 105 106 mask0 = LD_UB(&mc_filt_mask_arr[0]); 107 src -= (3 + 3 * src_stride); 108 109 /* rearranging filter */ 110 filt = LD_SH(filter_horiz); 111 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); 112 113 mask1 = mask0 + 2; 114 mask2 = mask0 + 4; 115 mask3 = mask0 + 6; 116 117 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 118 src += (7 * src_stride); 119 120 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 121 hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, 122 filt_hz1, filt_hz2, filt_hz3); 123 hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, 124 filt_hz1, filt_hz2, filt_hz3); 125 hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, 126 filt_hz1, filt_hz2, filt_hz3); 127 hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, 128 filt_hz1, filt_hz2, filt_hz3); 129 hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, 130 filt_hz1, filt_hz2, filt_hz3); 131 hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, 132 filt_hz1, filt_hz2, filt_hz3); 133 hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, 134 filt_hz1, filt_hz2, filt_hz3); 135 136 filt = LD_SH(filter_vert); 137 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); 138 139 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 140 ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); 141 ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); 142 143 for (loop_cnt = (height >> 2); loop_cnt--;) { 144 LD_SB4(src, src_stride, src7, src8, src9, src10); 145 XORI_B4_128_SB(src7, src8, src9, src10); 146 src += (4 * src_stride); 147 148 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 149 INSERT_D2_UB(tp0, tp1, dst0); 150 INSERT_D2_UB(tp2, tp3, dst1); 151 152 hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, 153 filt_hz1, filt_hz2, filt_hz3); 154 out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); 155 tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, 156 filt_vt2, filt_vt3); 157 158 hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, 159 filt_hz1, filt_hz2, filt_hz3); 160 out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); 161 tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, 162 filt_vt2, filt_vt3); 163 164 hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, 165 filt_hz1, filt_hz2, filt_hz3); 166 out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); 167 tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, 168 filt_vt2, filt_vt3); 169 170 hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, 171 filt_hz0, filt_hz1, filt_hz2, filt_hz3); 172 out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); 173 tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, 174 filt_vt2, filt_vt3); 175 176 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 177 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 178 CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, 179 dst_stride); 180 dst += (4 * dst_stride); 181 182 hz_out6 = hz_out10; 183 out0 = out2; 184 out1 = out3; 185 out2 = out8; 186 out4 = out6; 187 out5 = out7; 188 out6 = out9; 189 } 190 } 191 192 static void common_hv_8ht_8vt_and_aver_dst_16w_msa( 193 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 194 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 195 int32_t multiple8_cnt; 196 for (multiple8_cnt = 2; multiple8_cnt--;) { 197 common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 198 filter_horiz, filter_vert, height); 199 src += 8; 200 dst += 8; 201 } 202 } 203 204 static void common_hv_8ht_8vt_and_aver_dst_32w_msa( 205 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 206 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 207 int32_t multiple8_cnt; 208 for (multiple8_cnt = 4; multiple8_cnt--;) { 209 common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 210 filter_horiz, filter_vert, height); 211 src += 8; 212 dst += 8; 213 } 214 } 215 216 static void common_hv_8ht_8vt_and_aver_dst_64w_msa( 217 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 218 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 219 int32_t multiple8_cnt; 220 for (multiple8_cnt = 8; multiple8_cnt--;) { 221 common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 222 filter_horiz, filter_vert, height); 223 src += 8; 224 dst += 8; 225 } 226 } 227 228 static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( 229 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 230 int8_t *filter_horiz, int8_t *filter_vert) { 231 uint32_t tp0, tp1, tp2, tp3; 232 v16i8 src0, src1, src2, src3, src4, mask; 233 v16u8 filt_hz, filt_vt, vec0, vec1; 234 v16u8 dst0 = { 0 }, out; 235 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; 236 237 mask = LD_SB(&mc_filt_mask_arr[16]); 238 239 /* rearranging filter */ 240 filt = LD_UH(filter_horiz); 241 filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); 242 243 filt = LD_UH(filter_vert); 244 filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); 245 246 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 247 248 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); 249 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); 250 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 251 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); 252 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); 253 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 254 255 LW4(dst, dst_stride, tp0, tp1, tp2, tp3); 256 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 257 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 258 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 259 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 260 out = __msa_aver_u_b(out, dst0); 261 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 262 } 263 264 static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( 265 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 266 int8_t *filter_horiz, int8_t *filter_vert) { 267 uint32_t tp0, tp1, tp2, tp3; 268 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; 269 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; 270 v16u8 dst0 = { 0 }, dst1 = { 0 }; 271 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 272 v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; 273 v8i16 filt; 274 275 mask = LD_SB(&mc_filt_mask_arr[16]); 276 277 /* rearranging filter */ 278 filt = LD_SH(filter_horiz); 279 filt_hz = (v16u8)__msa_splati_h(filt, 0); 280 281 filt = LD_SH(filter_vert); 282 filt_vt = (v16u8)__msa_splati_h(filt, 0); 283 284 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 285 src += (8 * src_stride); 286 src8 = LD_SB(src); 287 288 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); 289 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); 290 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); 291 hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); 292 hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); 293 SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, 294 hz_out3, hz_out5, 8); 295 hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); 296 297 LW4(dst, dst_stride, tp0, tp1, tp2, tp3); 298 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 299 LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); 300 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); 301 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 302 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); 303 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, 304 tmp1, tmp2, tmp3); 305 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 306 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1); 307 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 308 ST4x8_UB(res0, res1, dst, dst_stride); 309 } 310 311 static void common_hv_2ht_2vt_and_aver_dst_4w_msa( 312 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 313 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 314 if (4 == height) { 315 common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, 316 filter_horiz, filter_vert); 317 } else if (8 == height) { 318 common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, 319 filter_horiz, filter_vert); 320 } 321 } 322 323 static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( 324 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 325 int8_t *filter_horiz, int8_t *filter_vert) { 326 uint64_t tp0, tp1, tp2, tp3; 327 v16i8 src0, src1, src2, src3, src4, mask; 328 v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3; 329 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; 330 v8i16 filt; 331 332 mask = LD_SB(&mc_filt_mask_arr[0]); 333 334 /* rearranging filter */ 335 filt = LD_SH(filter_horiz); 336 filt_hz = (v16u8)__msa_splati_h(filt, 0); 337 338 filt = LD_SH(filter_vert); 339 filt_vt = (v16u8)__msa_splati_h(filt, 0); 340 341 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 342 src += (5 * src_stride); 343 344 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 345 INSERT_D2_UB(tp0, tp1, dst0); 346 INSERT_D2_UB(tp2, tp3, dst1); 347 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 348 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 349 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 350 tmp0 = __msa_dotp_u_h(vec0, filt_vt); 351 352 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 353 vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 354 tmp1 = __msa_dotp_u_h(vec1, filt_vt); 355 356 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 357 vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 358 tmp2 = __msa_dotp_u_h(vec2, filt_vt); 359 360 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 361 vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 362 tmp3 = __msa_dotp_u_h(vec3, filt_vt); 363 364 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 365 PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); 366 } 367 368 static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( 369 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 370 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 371 uint32_t loop_cnt; 372 uint64_t tp0, tp1, tp2, tp3; 373 v16i8 src0, src1, src2, src3, src4, mask; 374 v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 }; 375 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; 376 v8i16 filt; 377 378 mask = LD_SB(&mc_filt_mask_arr[0]); 379 380 /* rearranging filter */ 381 filt = LD_SH(filter_horiz); 382 filt_hz = (v16u8)__msa_splati_h(filt, 0); 383 384 filt = LD_SH(filter_vert); 385 filt_vt = (v16u8)__msa_splati_h(filt, 0); 386 387 src0 = LD_SB(src); 388 src += src_stride; 389 390 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 391 392 for (loop_cnt = (height >> 2); loop_cnt--;) { 393 LD_SB4(src, src_stride, src1, src2, src3, src4); 394 src += (4 * src_stride); 395 396 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 397 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 398 tmp0 = __msa_dotp_u_h(vec0, filt_vt); 399 400 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 401 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 402 tmp1 = __msa_dotp_u_h(vec0, filt_vt); 403 404 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 405 406 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 407 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 408 tmp2 = __msa_dotp_u_h(vec0, filt_vt); 409 410 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 411 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 412 tmp3 = __msa_dotp_u_h(vec0, filt_vt); 413 414 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 415 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 416 INSERT_D2_UB(tp0, tp1, dst0); 417 INSERT_D2_UB(tp2, tp3, dst1); 418 PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); 419 dst += (4 * dst_stride); 420 } 421 } 422 423 static void common_hv_2ht_2vt_and_aver_dst_8w_msa( 424 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 425 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 426 if (4 == height) { 427 common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, 428 filter_horiz, filter_vert); 429 } else { 430 common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( 431 src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); 432 } 433 } 434 435 static void common_hv_2ht_2vt_and_aver_dst_16w_msa( 436 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 437 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 438 uint32_t loop_cnt; 439 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 440 v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; 441 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; 442 v8i16 filt; 443 444 mask = LD_SB(&mc_filt_mask_arr[0]); 445 446 /* rearranging filter */ 447 filt = LD_SH(filter_horiz); 448 filt_hz = (v16u8)__msa_splati_h(filt, 0); 449 450 filt = LD_SH(filter_vert); 451 filt_vt = (v16u8)__msa_splati_h(filt, 0); 452 453 LD_SB2(src, 8, src0, src1); 454 src += src_stride; 455 456 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 457 hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 458 459 for (loop_cnt = (height >> 2); loop_cnt--;) { 460 LD_SB4(src, src_stride, src0, src2, src4, src6); 461 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 462 src += (4 * src_stride); 463 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 464 465 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 466 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 467 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 468 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 469 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 470 PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); 471 dst += dst_stride; 472 473 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 474 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 475 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 476 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 477 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 478 PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst); 479 dst += dst_stride; 480 481 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 482 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); 483 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 484 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 485 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 486 PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); 487 dst += dst_stride; 488 489 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); 490 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); 491 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 492 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 493 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 494 PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst); 495 dst += dst_stride; 496 } 497 } 498 499 static void common_hv_2ht_2vt_and_aver_dst_32w_msa( 500 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 501 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 502 int32_t multiple8_cnt; 503 for (multiple8_cnt = 2; multiple8_cnt--;) { 504 common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, 505 filter_horiz, filter_vert, height); 506 src += 16; 507 dst += 16; 508 } 509 } 510 511 static void common_hv_2ht_2vt_and_aver_dst_64w_msa( 512 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 513 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 514 int32_t multiple8_cnt; 515 for (multiple8_cnt = 4; multiple8_cnt--;) { 516 common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, 517 filter_horiz, filter_vert, height); 518 src += 16; 519 dst += 16; 520 } 521 } 522 523 void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, 524 uint8_t *dst, ptrdiff_t dst_stride, 525 const InterpKernel *filter, int x0_q4, int x_step_q4, 526 int y0_q4, int y_step_q4, int w, int h) { 527 const int16_t *const filter_x = filter[x0_q4]; 528 const int16_t *const filter_y = filter[y0_q4]; 529 int8_t cnt, filt_hor[8], filt_ver[8]; 530 531 assert(x_step_q4 == 16); 532 assert(y_step_q4 == 16); 533 assert(((const int32_t *)filter_x)[1] != 0x800000); 534 assert(((const int32_t *)filter_y)[1] != 0x800000); 535 536 for (cnt = 0; cnt < 8; ++cnt) { 537 filt_hor[cnt] = filter_x[cnt]; 538 filt_ver[cnt] = filter_y[cnt]; 539 } 540 541 if (((const int32_t *)filter_x)[0] == 0 && 542 ((const int32_t *)filter_y)[0] == 0) { 543 switch (w) { 544 case 4: 545 common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, 546 (int32_t)dst_stride, &filt_hor[3], 547 &filt_ver[3], h); 548 break; 549 case 8: 550 common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, 551 (int32_t)dst_stride, &filt_hor[3], 552 &filt_ver[3], h); 553 break; 554 case 16: 555 common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, 556 (int32_t)dst_stride, 557 &filt_hor[3], &filt_ver[3], h); 558 break; 559 case 32: 560 common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, 561 (int32_t)dst_stride, 562 &filt_hor[3], &filt_ver[3], h); 563 break; 564 case 64: 565 common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, 566 (int32_t)dst_stride, 567 &filt_hor[3], &filt_ver[3], h); 568 break; 569 default: 570 vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, 571 x_step_q4, y0_q4, y_step_q4, w, h); 572 break; 573 } 574 } else if (((const int32_t *)filter_x)[0] == 0 || 575 ((const int32_t *)filter_y)[0] == 0) { 576 vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, 577 x_step_q4, y0_q4, y_step_q4, w, h); 578 } else { 579 switch (w) { 580 case 4: 581 common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, 582 (int32_t)dst_stride, filt_hor, 583 filt_ver, h); 584 break; 585 case 8: 586 common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, 587 (int32_t)dst_stride, filt_hor, 588 filt_ver, h); 589 break; 590 case 16: 591 common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, 592 (int32_t)dst_stride, filt_hor, 593 filt_ver, h); 594 break; 595 case 32: 596 common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, 597 (int32_t)dst_stride, filt_hor, 598 filt_ver, h); 599 break; 600 case 64: 601 common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, 602 (int32_t)dst_stride, filt_hor, 603 filt_ver, h); 604 break; 605 default: 606 vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, 607 x_step_q4, y0_q4, y_step_q4, w, h); 608 break; 609 } 610 } 611 } 612