/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

/* 8-tap horizontal filter of a 4x4 block; each 8-lane output vector
   packs two filtered rows of four pixels. */
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 mask0, mask1, mask2, mask3, out;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v8i16 filt, out0, out1;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  SRARI_H2_SH(out0, out1, FILTER_BITS);
  SAT_SH2_SH(out0, out1, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

/* 8-tap horizontal filter of a 4x8 block, processed as two groups of
   four rows. */
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src0, src1, src2, src3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out2, out3);
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);
  out = PCKEV_XORI128_UB(out2, out3);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

/* 4-wide 8-tap dispatch; only heights 4 and 8 are handled. */
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

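/* 8-tap horizontal filter of an 8x4 block: four rows are filtered in a
   single pass using the 8-wide shuffle/dot-product macro. */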
static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1, out2,
                             out3);
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  tmp0 = PCKEV_XORI128_UB(out0, out1);
  tmp1 = PCKEV_XORI128_UB(out2, out3);
  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
}

/* 8-tap horizontal filter of an 8-wide block, four rows per loop
   iteration; height must be a multiple of 4. */
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

/* 8-wide 8-tap dispatch: height 4 uses the single-pass path, larger
   heights loop in groups of four rows. */
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

/* 8-tap horizontal filter of a 16-wide block, two rows per loop
   iteration; each row is filtered as two 8-pixel halves. */
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (2 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    dst += dst_stride;
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst);
    dst += dst_stride;
  }
}

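/* 8-tap horizontal filter of a 32-wide block, two rows per loop
   iteration, each row stored as two 16-byte chunks. */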
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);

    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;
  }
}

/* 8-tap horizontal filter of a 64-wide block, one row per loop
   iteration, processed and stored as two 32-byte halves. */
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);

    src0 = LD_SB(src + 32);
    src2 = LD_SB(src + 48);
    src3 = LD_SB(src + 56);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst + 32);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 48);
    dst += dst_stride;
  }
}

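/* 2-tap (bilinear) horizontal filter of a 4x4 block; the two filter
   taps are applied with unsigned dot products, then rounded. */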
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

/* 2-tap horizontal filter of a 4x8 block; all eight rows are loaded and
   filtered in one pass. */
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16i8 res0, res1, res2, res3;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

/* 4-wide 2-tap dispatch; only heights 4 and 8 are handled. */
static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

/* 2-tap horizontal filter of an 8x4 block. */
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
  ST8x4_UB(src0, src1, dst, dst_stride);
}

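/* 2-tap horizontal filter of an 8-wide block for heights that are a
   multiple of 8 (8 or 16 rows); loads of the next row group are
   interleaved with the stores of the previous one. */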
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask, out0, out1;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
  }
}

/* 8-wide 2-tap dispatch: height 4 uses the single-pass path. */
static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

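/* 2-tap horizontal filter of a 16-wide block, four rows per loop
   iteration; the first group of four rows is peeled off before the
   loop. */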
static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  loop_cnt = (height >> 2) - 1;

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
              out2, out3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
              out6, out7);
  SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
  SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
  PCKEV_ST_SB(out0, out1, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out2, out3, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out4, out5, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out6, out7, dst);
  dst += dst_stride;

  for (; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
  }
}

/* 2-tap horizontal filter of a 32-wide block, two rows per loop
   iteration. */
static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height >> 1; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    PCKEV_ST_SB(out6, out7, dst + 16);
    dst += dst_stride;
  }
}

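/* 2-tap horizontal filter of a 64-wide block, one row per loop
   iteration, stored as four 16-byte chunks. */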
static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src4 = LD_SB(src + 32);
    src6 = LD_SB(src + 48);
    src7 = LD_SB(src + 56);
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    PCKEV_ST_SB(out4, out5, dst + 32);
    PCKEV_ST_SB(out6, out7, dst + 48);
    dst += dst_stride;
  }
}

/* Top-level horizontal convolve: takes the 2-tap path when the first
   two filter taps are zero (the bilinear kernels, which only use taps 3
   and 4), the full 8-tap path otherwise, and falls back to the C
   version for unsupported widths. */
void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  const int16_t *const filter_x = filter[x0_q4];
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }

  if (((const int32_t *)filter_x)[0] == 0) {
    switch (w) {
      case 4:
        common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_16w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_32w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_64w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                              x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 8:
        common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 16:
        common_hz_8t_16w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, filt_hor, h);
        break;
      case 32:
        common_hz_8t_32w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, filt_hor, h);
        break;
      case 64:
        common_hz_8t_64w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, filt_hor, h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                              x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}