/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

/* Horizontal convolution (8-tap and 2-tap) with averaging into dst for the
 * MIPS MSA SIMD unit. The 8-tap paths share one structure: src is moved back
 * 3 pixels so the mask-driven shuffles cover the full 8-tap window, source
 * bytes are XORed with 128 to make them signed for the dot-product
 * instructions, results are rounded by FILTER_BITS, saturated to the signed
 * 8-bit range, packed to bytes, XORed back to unsigned, and averaged with the
 * existing dst pixels. */

static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst0, dst1, dst2, dst3, res2, res3;
  v16u8 mask0, mask1, mask2, mask3;
  v8i16 filt, res0, res1;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                             mask3, filt0, filt1, filt2, filt3, res0, res1);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  SRARI_H2_SH(res0, res1, FILTER_BITS);
  SAT_SH2_SH(res0, res1, 7);
  PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
  XORI_B2_128_UB(res2, res3);
  AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8i16 filt, vec0, vec1, vec2, vec3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                             mask3, filt0, filt1, filt2, filt3, vec0, vec1);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                             mask3, filt0, filt1, filt2, filt3, vec2, vec3);
  SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1,
              res2, res3);
  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
  XORI_B2_128_UB(res0, res2);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
             dst4, dst6);
  ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
  AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
  ST4x8_UB(res0, res2, dst, dst_stride);
}
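/* Width-4, 8-tap dispatch. Only heights 4 and 8 are handled; for any other
 * height the function is a no-op, on the assumption (matching the 4x4 and
 * 4x8 block sizes this path serves) that no other height is ever passed. */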
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                      filter);
  } else if (8 == height) {
    common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                      filter);
  }
}

static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
                            dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height >> 1; loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    src += (2 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
               vec12);
    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
               vec13);
    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
               vec14);
    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
               vec15);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                vec1, vec2, vec3);
    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                vec9, vec10, vec11);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
                 vec1, vec2, vec3);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
                 vec8, vec9, vec10, vec11);
    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                out2, out3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
    dst += dst_stride;
    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
    dst += dst_stride;
  }
}

static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
               vec12);
    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
               vec13);
    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
               vec14);
    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
               vec15);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                vec1, vec2, vec3);
    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                vec9, vec10, vec11);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
                 vec1, vec2, vec3);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
                 vec8, vec9, vec10, vec11);
    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    LD_UB2(dst, 16, dst1, dst2);
    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
    dst += dst_stride;
  }
}

static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt, cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    for (cnt = 0; cnt < 2; ++cnt) {
      src0 = LD_SB(&src[cnt << 5]);
      src2 = LD_SB(&src[16 + (cnt << 5)]);
      src3 = LD_SB(&src[24 + (cnt << 5)]);
      src1 = __msa_sldi_b(src2, src0, 8);

      XORI_B4_128_SB(src0, src1, src2, src3);
      VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
                 vec12);
      VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
                 vec13);
      VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
                 vec14);
      VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
                 vec15);
      DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                  vec1, vec2, vec3);
      DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                  vec9, vec10, vec11);
      DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
                   vec1, vec2, vec3);
      DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
                   vec8, vec9, vec10, vec11);
      ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
                  out1, out2, out3);
      SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
      SAT_SH4_SH(out0, out1, out2, out3, 7);
      LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
      PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
      PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
    }

    src += src_stride;
    dst += dst_stride;
  }
}

static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
              res2, res3);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
             dst4, dst6);
  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
              res2, res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                      filter);
  } else if (8 == height) {
    common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                      filter);
  }
}
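/* Bilinear (2-tap) paths: only the center tap pair of the filter is nonzero,
 * so one splatted coefficient pair (filt0) and unsigned 8x8 dot products
 * suffice. Unlike the 8-tap paths there is no sign conversion or saturation
 * step, only the FILTER_BITS rounding shift before averaging with dst. */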
static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                     dst_stride);
}

static void common_hz_2t_and_aver_dst_8x8mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter, int32_t height) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                     dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                     dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                       dst_stride);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                       dst_stride);
  }
}
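/* Width-8 bilinear dispatch: height 4 takes the direct 8x4 body; heights 8
 * and 16 share the 8x8mult body above, which processes two groups of four
 * rows and adds two more groups when height is 16. */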
static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                      filter);
  } else {
    common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                          filter, height);
  }
}

static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
              res2, res3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
              res6, res7);
  SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
  SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
  dst += dst_stride;

  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                res1, res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
                res5, res6, res7);
    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
    dst += dst_stride;
  }
}

static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
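
  /* Each 32-wide row is covered by four 16-byte vectors at byte offsets 0,
   * 8, 16 and 24; the one at offset 8 (src1/src5) is built from the two
   * loads at offsets 0 and 16 with a byte shift (__msa_sldi_b) rather than
   * a fourth load per row. */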
  for (loop_cnt = (height >> 1); loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                res1, res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
                res5, res6, res7);
    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
    LD_UB2(dst, 16, dst0, dst1);
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
    dst += dst_stride;
    LD_UB2(dst, 16, dst2, dst3);
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
    dst += dst_stride;
  }
}

static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    LD_SB4(src, 16, src0, src2, src4, src6);
    src7 = LD_SB(src + 56);
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4,
                out5, out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
    PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
    PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
    PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
    dst += dst_stride;
  }
}

void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }
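
  /* The filter is examined as int32 pairs: if taps 0 and 1 are zero, the
   * kernel is treated as bilinear and only the center taps (&filt_hor[3])
   * are passed down; otherwise the full 8-tap path is used. The asserts
   * above reject scaled convolution (x_step_q4 != 16) and, reading taps 2-3
   * as a little-endian word, the pure-copy kernel (tap 3 == 128), which are
   * handled elsewhere. */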
  if (((const int32_t *)filter_x)[0] == 0) {
    switch (w) {
      case 4:
        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3],
                                          h);
        break;
      case 32:
        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3],
                                          h);
        break;
      case 64:
        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3],
                                          h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                  x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_hor, h);
        break;
      case 8:
        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_hor, h);
        break;
      case 16:
        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      case 32:
        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      case 64:
        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                  x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}