1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "./vp8_rtcd.h" 12 #include "vpx_ports/mem.h" 13 #include "vp8/common/filter.h" 14 #include "vp8/common/mips/msa/vp8_macros_msa.h" 15 16 DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_msa[7][8]) = { 17 { 0, -6, 123, 12, -1, 0, 0, 0 }, 18 { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ 19 { 0, -9, 93, 50, -6, 0, 0, 0 }, 20 { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */ 21 { 0, -6, 50, 93, -9, 0, 0, 0 }, 22 { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */ 23 { 0, -1, 12, 123, -6, 0, 0, 0 }, 24 }; 25 26 static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { 27 /* 8 width cases */ 28 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 29 /* 4 width cases */ 30 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, 31 /* 4 width cases */ 32 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 33 }; 34 35 #define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \ 36 filt_h2) \ 37 ({ \ 38 v16i8 vec0_m, vec1_m, vec2_m; \ 39 v8i16 hz_out_m; \ 40 \ 41 VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ 42 vec0_m, vec1_m, vec2_m); \ 43 hz_out_m = \ 44 DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ 45 \ 46 hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT); \ 47 hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ 48 \ 49 hz_out_m; \ 50 }) 51 52 #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ 53 mask2, filt0, filt1, filt2, out0, out1) \ 54 { \ 55 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ 56 \ 57 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 58 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 59 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 60 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 61 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ 62 DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ 63 } 64 65 #define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ 66 mask2, filt0, filt1, filt2, out0, out1, \ 67 out2, out3) \ 68 { \ 69 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 70 \ 71 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 72 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 73 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 74 out0, out1, out2, out3); \ 75 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ 76 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ 77 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m); \ 78 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m); \ 79 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ 80 out0, out1, out2, out3); \ 81 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \ 82 out0, out1, out2, out3); \ 83 } 84 85 #define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ 86 ({ \ 87 v8i16 tmp0; \ 88 \ 89 tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ 90 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ 91 \ 92 tmp0; \ 93 }) 94 95 #define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ 96 ({ \ 97 v16i8 vec0_m, vec1_m; \ 98 v8i16 hz_out_m; \ 99 \ 100 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \ 101 hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \ 102 \ 103 hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT); \ 104 hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ 105 \ 106 hz_out_m; \ 107 }) 108 109 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ 110 filt0, filt1, out0, out1) \ 111 { \ 112 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 113 \ 114 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 115 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 116 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 117 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 118 } 119 120 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ 121 filt0, filt1, out0, out1, out2, out3) \ 122 { \ 123 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 124 \ 125 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 126 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 127 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 128 out0, out1, out2, out3); \ 129 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ 130 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ 131 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ 132 out0, out1, out2, out3); \ 133 } 134 135 static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, 136 uint8_t *RESTRICT dst, int32_t dst_stride, 137 const int8_t *filter) { 138 v16i8 src0, src1, src2, src3, filt0, filt1, filt2; 139 v16u8 mask0, mask1, mask2, out; 140 v8i16 filt, out0, out1; 141 142 mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); 143 src -= 2; 144 145 filt = LD_SH(filter); 146 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 147 148 mask1 = mask0 + 2; 149 mask2 = mask0 + 4; 150 151 LD_SB4(src, src_stride, src0, src1, src2, src3); 152 XORI_B4_128_SB(src0, src1, src2, src3); 153 HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, 154 filt1, filt2, out0, out1); 155 SRARI_H2_SH(out0, out1, VP8_FILTER_SHIFT); 156 SAT_SH2_SH(out0, out1, 7); 157 out = PCKEV_XORI128_UB(out0, out1); 158 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 159 } 160 161 static void common_hz_6t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride, 162 uint8_t *RESTRICT dst, int32_t dst_stride, 163 const int8_t *filter) { 164 v16i8 src0, src1, src2, src3, filt0, filt1, filt2; 165 v16u8 mask0, mask1, mask2, out; 166 v8i16 filt, out0, out1, out2, out3; 167 168 mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); 169 src -= 2; 170 171 filt = LD_SH(filter); 172 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 173 174 mask1 = mask0 + 2; 175 mask2 = mask0 + 4; 176 177 LD_SB4(src, src_stride, src0, src1, src2, src3); 178 XORI_B4_128_SB(src0, src1, src2, src3); 179 src += (4 * src_stride); 180 HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, 181 filt1, filt2, out0, out1); 182 LD_SB4(src, src_stride, src0, src1, src2, src3); 183 XORI_B4_128_SB(src0, src1, src2, src3); 184 HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, 185 filt1, filt2, out2, out3); 186 SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); 187 SAT_SH4_SH(out0, out1, out2, out3, 7); 188 out = PCKEV_XORI128_UB(out0, out1); 189 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 190 dst += (4 * dst_stride); 191 out = PCKEV_XORI128_UB(out2, out3); 192 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 193 } 194 195 static void common_hz_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, 196 uint8_t *RESTRICT dst, int32_t dst_stride, 197 const int8_t *filter, int32_t height) { 198 if (4 == height) { 199 common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter); 200 } else if (8 == height) { 201 common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter); 202 } 203 } 204 205 static void common_hz_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, 206 uint8_t *RESTRICT dst, int32_t dst_stride, 207 const int8_t *filter, int32_t height) { 208 uint32_t loop_cnt; 209 v16i8 src0, src1, src2, src3, filt0, filt1, filt2; 210 v16u8 mask0, mask1, mask2, tmp0, tmp1; 211 v8i16 filt, out0, out1, out2, out3; 212 213 mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]); 214 src -= 2; 215 216 filt = LD_SH(filter); 217 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 218 219 mask1 = mask0 + 2; 220 mask2 = mask0 + 4; 221 222 LD_SB4(src, src_stride, src0, src1, src2, src3); 223 XORI_B4_128_SB(src0, src1, src2, src3); 224 src += (4 * src_stride); 225 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, 226 filt1, filt2, out0, out1, out2, out3); 227 SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); 228 SAT_SH4_SH(out0, out1, out2, out3, 7); 229 tmp0 = PCKEV_XORI128_UB(out0, out1); 230 tmp1 = PCKEV_XORI128_UB(out2, out3); 231 ST8x4_UB(tmp0, tmp1, dst, dst_stride); 232 dst += (4 * dst_stride); 233 234 for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { 235 LD_SB4(src, src_stride, src0, src1, src2, src3); 236 XORI_B4_128_SB(src0, src1, src2, src3); 237 src += (4 * src_stride); 238 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 239 filt0, filt1, filt2, out0, out1, out2, out3); 240 SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); 241 SAT_SH4_SH(out0, out1, out2, out3, 7); 242 tmp0 = PCKEV_XORI128_UB(out0, out1); 243 tmp1 = PCKEV_XORI128_UB(out2, out3); 244 ST8x4_UB(tmp0, tmp1, dst, dst_stride); 245 dst += (4 * dst_stride); 246 } 247 } 248 249 static void common_hz_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, 250 uint8_t *RESTRICT dst, int32_t dst_stride, 251 const int8_t *filter, int32_t height) { 252 uint32_t loop_cnt; 253 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2; 254 v16u8 mask0, mask1, mask2, out; 255 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; 256 257 mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]); 258 src -= 2; 259 260 filt = LD_SH(filter); 261 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 262 263 mask1 = mask0 + 2; 264 mask2 = mask0 + 4; 265 266 for (loop_cnt = (height >> 2); loop_cnt--;) { 267 LD_SB4(src, src_stride, src0, src2, src4, src6); 268 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 269 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 270 src += (4 * src_stride); 271 272 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 273 filt0, filt1, filt2, out0, out1, out2, out3); 274 HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2, 275 filt0, filt1, filt2, out4, out5, out6, out7); 276 SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); 277 SRARI_H4_SH(out4, out5, out6, out7, VP8_FILTER_SHIFT); 278 SAT_SH4_SH(out0, out1, out2, out3, 7); 279 SAT_SH4_SH(out4, out5, out6, out7, 7); 280 out = PCKEV_XORI128_UB(out0, out1); 281 ST_UB(out, dst); 282 dst += dst_stride; 283 out = PCKEV_XORI128_UB(out2, out3); 284 ST_UB(out, dst); 285 dst += dst_stride; 286 out = PCKEV_XORI128_UB(out4, out5); 287 ST_UB(out, dst); 288 dst += dst_stride; 289 out = PCKEV_XORI128_UB(out6, out7); 290 ST_UB(out, dst); 291 dst += dst_stride; 292 } 293 } 294 295 static void common_vt_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, 296 uint8_t *RESTRICT dst, int32_t dst_stride, 297 const int8_t *filter, int32_t height) { 298 uint32_t loop_cnt; 299 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 300 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 301 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; 302 v16u8 out; 303 v8i16 filt, out10, out32; 304 305 src -= (2 * src_stride); 306 307 filt = LD_SH(filter); 308 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 309 310 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 311 src += (5 * src_stride); 312 313 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 314 src32_r, src43_r); 315 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 316 XORI_B2_128_SB(src2110, src4332); 317 318 for (loop_cnt = (height >> 2); loop_cnt--;) { 319 LD_SB4(src, src_stride, src5, src6, src7, src8); 320 src += (4 * src_stride); 321 322 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 323 src76_r, src87_r); 324 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); 325 XORI_B2_128_SB(src6554, src8776); 326 out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); 327 out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); 328 SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT); 329 SAT_SH2_SH(out10, out32, 7); 330 out = PCKEV_XORI128_UB(out10, out32); 331 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 332 dst += (4 * dst_stride); 333 334 src2110 = src6554; 335 src4332 = src8776; 336 src4 = src8; 337 } 338 } 339 340 static void common_vt_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, 341 uint8_t *RESTRICT dst, int32_t dst_stride, 342 const int8_t *filter, int32_t height) { 343 uint32_t loop_cnt; 344 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10; 345 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; 346 v16i8 src109_r, filt0, filt1, filt2; 347 v16u8 tmp0, tmp1; 348 v8i16 filt, out0_r, out1_r, out2_r, out3_r; 349 350 src -= (2 * src_stride); 351 352 filt = LD_SH(filter); 353 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 354 355 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 356 src += (5 * src_stride); 357 358 XORI_B5_128_SB(src0, src1, src2, src3, src4); 359 ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3, src10_r, src32_r, 360 src21_r, src43_r); 361 362 for (loop_cnt = (height >> 2); loop_cnt--;) { 363 LD_SB4(src, src_stride, src7, src8, src9, src10); 364 XORI_B4_128_SB(src7, src8, src9, src10); 365 src += (4 * src_stride); 366 367 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r, 368 src87_r, src98_r, src109_r); 369 out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); 370 out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); 371 out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); 372 out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); 373 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT); 374 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 375 tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); 376 tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); 377 ST8x4_UB(tmp0, tmp1, dst, dst_stride); 378 dst += (4 * dst_stride); 379 380 src10_r = src76_r; 381 src32_r = src98_r; 382 src21_r = src87_r; 383 src43_r = src109_r; 384 src4 = src10; 385 } 386 } 387 388 static void common_vt_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, 389 uint8_t *RESTRICT dst, int32_t dst_stride, 390 const int8_t *filter, int32_t height) { 391 uint32_t loop_cnt; 392 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 393 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; 394 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; 395 v16i8 src65_l, src87_l, filt0, filt1, filt2; 396 v16u8 tmp0, tmp1, tmp2, tmp3; 397 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt; 398 399 src -= (2 * src_stride); 400 401 filt = LD_SH(filter); 402 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); 403 404 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 405 src += (5 * src_stride); 406 407 XORI_B5_128_SB(src0, src1, src2, src3, src4); 408 ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r, src32_r, 409 src43_r, src21_r); 410 ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l, src32_l, 411 src43_l, src21_l); 412 413 for (loop_cnt = (height >> 2); loop_cnt--;) { 414 LD_SB4(src, src_stride, src5, src6, src7, src8); 415 src += (4 * src_stride); 416 417 XORI_B4_128_SB(src5, src6, src7, src8); 418 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 419 src76_r, src87_r); 420 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, 421 src76_l, src87_l); 422 out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); 423 out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); 424 out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); 425 out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); 426 out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 427 out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 428 out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 429 out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 430 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT); 431 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT); 432 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 433 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 434 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, 435 tmp0, tmp1, tmp2, tmp3); 436 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); 437 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); 438 dst += (4 * dst_stride); 439 440 src10_r = src54_r; 441 src32_r = src76_r; 442 src21_r = src65_r; 443 src43_r = src87_r; 444 src10_l = src54_l; 445 src32_l = src76_l; 446 src21_l = src65_l; 447 src43_l = src87_l; 448 src4 = src8; 449 } 450 } 451 452 static void common_hv_6ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, 453 uint8_t *RESTRICT dst, int32_t dst_stride, 454 const int8_t *filter_horiz, 455 const int8_t *filter_vert, 456 int32_t height) { 457 uint32_t loop_cnt; 458 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 459 v16i8 filt_hz0, filt_hz1, filt_hz2; 460 v16u8 mask0, mask1, mask2, out; 461 v8i16 tmp0, tmp1; 462 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 463 v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3; 464 465 mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); 466 src -= (2 + 2 * src_stride); 467 468 filt = LD_SH(filter_horiz); 469 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); 470 filt = LD_SH(filter_vert); 471 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); 472 473 mask1 = mask0 + 2; 474 mask2 = mask0 + 4; 475 476 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 477 src += (5 * src_stride); 478 479 XORI_B5_128_SB(src0, src1, src2, src3, src4); 480 hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, 481 filt_hz2); 482 hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, 483 filt_hz2); 484 hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); 485 hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, 486 filt_hz2); 487 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 488 489 for (loop_cnt = (height >> 2); loop_cnt--;) { 490 LD_SB2(src, src_stride, src5, src6); 491 src += (2 * src_stride); 492 493 XORI_B2_128_SB(src5, src6); 494 hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, 495 filt_hz1, filt_hz2); 496 hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8); 497 498 LD_SB2(src, src_stride, src7, src8); 499 src += (2 * src_stride); 500 501 XORI_B2_128_SB(src7, src8); 502 hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0, 503 filt_hz1, filt_hz2); 504 hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); 505 506 out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); 507 tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); 508 509 out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); 510 tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); 511 512 SRARI_H2_SH(tmp0, tmp1, 7); 513 SAT_SH2_SH(tmp0, tmp1, 7); 514 out = PCKEV_XORI128_UB(tmp0, tmp1); 515 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 516 dst += (4 * dst_stride); 517 518 hz_out3 = hz_out7; 519 out0 = out2; 520 out1 = out3; 521 } 522 } 523 524 static void common_hv_6ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, 525 uint8_t *RESTRICT dst, int32_t dst_stride, 526 const int8_t *filter_horiz, 527 const int8_t *filter_vert, 528 int32_t height) { 529 uint32_t loop_cnt; 530 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 531 v16i8 filt_hz0, filt_hz1, filt_hz2; 532 v16u8 mask0, mask1, mask2, vec0, vec1; 533 v8i16 filt, filt_vt0, filt_vt1, filt_vt2; 534 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 535 v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; 536 v8i16 tmp0, tmp1, tmp2, tmp3; 537 538 mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]); 539 src -= (2 + 2 * src_stride); 540 541 filt = LD_SH(filter_horiz); 542 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); 543 544 mask1 = mask0 + 2; 545 mask2 = mask0 + 4; 546 547 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 548 src += (5 * src_stride); 549 550 XORI_B5_128_SB(src0, src1, src2, src3, src4); 551 hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, 552 filt_hz2); 553 hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, 554 filt_hz2); 555 hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, 556 filt_hz2); 557 hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, 558 filt_hz2); 559 hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, 560 filt_hz2); 561 562 filt = LD_SH(filter_vert); 563 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); 564 565 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 566 ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); 567 568 for (loop_cnt = (height >> 2); loop_cnt--;) { 569 LD_SB4(src, src_stride, src5, src6, src7, src8); 570 src += (4 * src_stride); 571 572 XORI_B4_128_SB(src5, src6, src7, src8); 573 hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, 574 filt_hz1, filt_hz2); 575 out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); 576 tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); 577 578 hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, 579 filt_hz1, filt_hz2); 580 out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5); 581 tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); 582 583 hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0, 584 filt_hz1, filt_hz2); 585 out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); 586 tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); 587 588 hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0, 589 filt_hz1, filt_hz2); 590 out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); 591 tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); 592 593 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 594 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 595 vec0 = PCKEV_XORI128_UB(tmp0, tmp1); 596 vec1 = PCKEV_XORI128_UB(tmp2, tmp3); 597 ST8x4_UB(vec0, vec1, dst, dst_stride); 598 dst += (4 * dst_stride); 599 600 hz_out4 = hz_out8; 601 out0 = out2; 602 out1 = out7; 603 out3 = out5; 604 out4 = out6; 605 } 606 } 607 608 static void common_hv_6ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, 609 uint8_t *RESTRICT dst, int32_t dst_stride, 610 const int8_t *filter_horiz, 611 const int8_t *filter_vert, 612 int32_t height) { 613 int32_t multiple8_cnt; 614 for (multiple8_cnt = 2; multiple8_cnt--;) { 615 common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, 616 filter_vert, height); 617 src += 8; 618 dst += 8; 619 } 620 } 621 622 static void common_hz_4t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, 623 uint8_t *RESTRICT dst, int32_t dst_stride, 624 const int8_t *filter) { 625 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 626 v8i16 filt, out0, out1; 627 v16u8 out; 628 629 mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]); 630 src -= 1; 631 632 filt = LD_SH(filter); 633 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 634 635 mask1 = mask0 + 2; 636 637 LD_SB4(src, src_stride, src0, src1, src2, src3); 638 XORI_B4_128_SB(src0, src1, src2, src3); 639 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, 640 out0, out1); 641 SRARI_H2_SH(out0, out1, VP8_FILTER_SHIFT); 642 SAT_SH2_SH(out0, out1, 7); 643 out = PCKEV_XORI128_UB(out0, out1); 644 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 645 } 646 647 static void common_hz_4t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride, 648 uint8_t *RESTRICT dst, int32_t dst_stride, 649 const int8_t *filter) { 650 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 651 v16u8 out; 652 v8i16 filt, out0, out1, out2, out3; 653 654 mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]); 655 src -= 1; 656 657 filt = LD_SH(filter); 658 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 659 660 mask1 = mask0 + 2; 661 662 LD_SB4(src, src_stride, src0, src1, src2, src3); 663 src += (4 * src_stride); 664 665 XORI_B4_128_SB(src0, src1, src2, src3); 666 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, 667 out0, out1); 668 LD_SB4(src, src_stride, src0, src1, src2, src3); 669 XORI_B4_128_SB(src0, src1, src2, src3); 670 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, 671 out2, out3); 672 SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); 673 SAT_SH4_SH(out0, out1, out2, out3, 7); 674 out = PCKEV_XORI128_UB(out0, out1); 675 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 676 dst += (4 * dst_stride); 677 out = PCKEV_XORI128_UB(out2, out3); 678 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 679 } 680 681 static void common_hz_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, 682 uint8_t *RESTRICT dst, int32_t dst_stride, 683 const int8_t *filter, int32_t height) { 684 if (4 == height) { 685 common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter); 686 } else if (8 == height) { 687 common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter); 688 } 689 } 690 691 static void common_hz_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, 692 uint8_t *RESTRICT dst, int32_t dst_stride, 693 const int8_t *filter, int32_t height) { 694 uint32_t loop_cnt; 695 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; 696 v16u8 tmp0, tmp1; 697 v8i16 filt, out0, out1, out2, out3; 698 699 mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]); 700 src -= 1; 701 702 filt = LD_SH(filter); 703 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 704 705 mask1 = mask0 + 2; 706 707 for (loop_cnt = (height >> 2); loop_cnt--;) { 708 LD_SB4(src, src_stride, src0, src1, src2, src3); 709 src += (4 * src_stride); 710 711 XORI_B4_128_SB(src0, src1, src2, src3); 712 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, 713 filt1, out0, out1, out2, out3); 714 SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); 715 SAT_SH4_SH(out0, out1, out2, out3, 7); 716 tmp0 = PCKEV_XORI128_UB(out0, out1); 717 tmp1 = PCKEV_XORI128_UB(out2, out3); 718 ST8x4_UB(tmp0, tmp1, dst, dst_stride); 719 dst += (4 * dst_stride); 720 } 721 } 722 723 static void common_hz_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, 724 uint8_t *RESTRICT dst, int32_t dst_stride, 725 const int8_t *filter, int32_t height) { 726 uint32_t loop_cnt; 727 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 728 v16i8 filt0, filt1, mask0, mask1; 729 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; 730 v16u8 out; 731 732 mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]); 733 src -= 1; 734 735 filt = LD_SH(filter); 736 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 737 738 mask1 = mask0 + 2; 739 740 for (loop_cnt = (height >> 2); loop_cnt--;) { 741 LD_SB4(src, src_stride, src0, src2, src4, src6); 742 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 743 src += (4 * src_stride); 744 745 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 746 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, 747 filt1, out0, out1, out2, out3); 748 HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, 749 filt1, out4, out5, out6, out7); 750 SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); 751 SRARI_H4_SH(out4, out5, out6, out7, VP8_FILTER_SHIFT); 752 SAT_SH4_SH(out0, out1, out2, out3, 7); 753 SAT_SH4_SH(out4, out5, out6, out7, 7); 754 out = PCKEV_XORI128_UB(out0, out1); 755 ST_UB(out, dst); 756 dst += dst_stride; 757 out = PCKEV_XORI128_UB(out2, out3); 758 ST_UB(out, dst); 759 dst += dst_stride; 760 out = PCKEV_XORI128_UB(out4, out5); 761 ST_UB(out, dst); 762 dst += dst_stride; 763 out = PCKEV_XORI128_UB(out6, out7); 764 ST_UB(out, dst); 765 dst += dst_stride; 766 } 767 } 768 769 static void common_vt_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, 770 uint8_t *RESTRICT dst, int32_t dst_stride, 771 const int8_t *filter, int32_t height) { 772 uint32_t loop_cnt; 773 v16i8 src0, src1, src2, src3, src4, src5; 774 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; 775 v16i8 src2110, src4332, filt0, filt1; 776 v8i16 filt, out10, out32; 777 v16u8 out; 778 779 src -= src_stride; 780 781 filt = LD_SH(filter); 782 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 783 784 LD_SB3(src, src_stride, src0, src1, src2); 785 src += (3 * src_stride); 786 787 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 788 789 src2110 = (v16i8)__msa_ilvr_d((v2i64)src21_r, (v2i64)src10_r); 790 src2110 = (v16i8)__msa_xori_b((v16u8)src2110, 128); 791 792 for (loop_cnt = (height >> 2); loop_cnt--;) { 793 LD_SB3(src, src_stride, src3, src4, src5); 794 src += (3 * src_stride); 795 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 796 src4332 = (v16i8)__msa_ilvr_d((v2i64)src43_r, (v2i64)src32_r); 797 src4332 = (v16i8)__msa_xori_b((v16u8)src4332, 128); 798 out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1); 799 800 src2 = LD_SB(src); 801 src += (src_stride); 802 ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r); 803 src2110 = (v16i8)__msa_ilvr_d((v2i64)src65_r, (v2i64)src54_r); 804 src2110 = (v16i8)__msa_xori_b((v16u8)src2110, 128); 805 out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1); 806 SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT); 807 SAT_SH2_SH(out10, out32, 7); 808 out = PCKEV_XORI128_UB(out10, out32); 809 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 810 dst += (4 * dst_stride); 811 } 812 } 813 814 static void common_vt_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, 815 uint8_t *RESTRICT dst, int32_t dst_stride, 816 const int8_t *filter, int32_t height) { 817 uint32_t loop_cnt; 818 v16i8 src0, src1, src2, src7, src8, src9, src10; 819 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1; 820 v16u8 tmp0, tmp1; 821 v8i16 filt, out0_r, out1_r, out2_r, out3_r; 822 823 src -= src_stride; 824 825 filt = LD_SH(filter); 826 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 827 828 LD_SB3(src, src_stride, src0, src1, src2); 829 src += (3 * src_stride); 830 831 XORI_B3_128_SB(src0, src1, src2); 832 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 833 834 for (loop_cnt = (height >> 2); loop_cnt--;) { 835 LD_SB4(src, src_stride, src7, src8, src9, src10); 836 src += (4 * src_stride); 837 838 XORI_B4_128_SB(src7, src8, src9, src10); 839 ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9, src72_r, 840 src87_r, src98_r, src109_r); 841 out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1); 842 out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1); 843 out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1); 844 out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1); 845 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT); 846 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 847 tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); 848 tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); 849 ST8x4_UB(tmp0, tmp1, dst, dst_stride); 850 dst += (4 * dst_stride); 851 852 src10_r = src98_r; 853 src21_r = src109_r; 854 src2 = src10; 855 } 856 } 857 858 static void common_vt_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, 859 uint8_t *RESTRICT dst, int32_t dst_stride, 860 const int8_t *filter, int32_t height) { 861 uint32_t loop_cnt; 862 v16i8 src0, src1, src2, src3, src4, src5, src6; 863 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l; 864 v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1; 865 v16u8 tmp0, tmp1, tmp2, tmp3; 866 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; 867 868 src -= src_stride; 869 870 filt = LD_SH(filter); 871 SPLATI_H2_SB(filt, 0, 1, filt0, filt1); 872 873 LD_SB3(src, src_stride, src0, src1, src2); 874 src += (3 * src_stride); 875 876 XORI_B3_128_SB(src0, src1, src2); 877 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 878 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 879 880 for (loop_cnt = (height >> 2); loop_cnt--;) { 881 LD_SB4(src, src_stride, src3, src4, src5, src6); 882 src += (4 * src_stride); 883 884 XORI_B4_128_SB(src3, src4, src5, src6); 885 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r, 886 src54_r, src65_r); 887 ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_l, src43_l, 888 src54_l, src65_l); 889 out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1); 890 out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1); 891 out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1); 892 out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1); 893 out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1); 894 out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1); 895 out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1); 896 out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1); 897 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT); 898 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT); 899 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); 900 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); 901 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, 902 tmp0, tmp1, tmp2, tmp3); 903 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); 904 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); 905 dst += (4 * dst_stride); 906 907 src10_r = src54_r; 908 src21_r = src65_r; 909 src10_l = src54_l; 910 src21_l = src65_l; 911 src2 = src6; 912 } 913 } 914 915 static void common_hv_4ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, 916 uint8_t *RESTRICT dst, int32_t dst_stride, 917 const int8_t *filter_horiz, 918 const int8_t *filter_vert, 919 int32_t height) { 920 uint32_t loop_cnt; 921 v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; 922 v16u8 mask0, mask1, out; 923 v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; 924 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; 925 926 mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); 927 src -= (1 + 1 * src_stride); 928 929 filt = LD_SH(filter_horiz); 930 SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); 931 932 mask1 = mask0 + 2; 933 934 LD_SB3(src, src_stride, src0, src1, src2); 935 src += (3 * src_stride); 936 937 XORI_B3_128_SB(src0, src1, src2); 938 hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); 939 hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1); 940 vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 941 942 filt = LD_SH(filter_vert); 943 SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); 944 945 for (loop_cnt = (height >> 2); loop_cnt--;) { 946 LD_SB4(src, src_stride, src3, src4, src5, src6); 947 src += (4 * src_stride); 948 949 XORI_B2_128_SB(src3, src4); 950 hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); 951 hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8); 952 vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2); 953 tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 954 955 XORI_B2_128_SB(src5, src6); 956 hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); 957 hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8); 958 vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); 959 tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); 960 961 SRARI_H2_SH(tmp0, tmp1, 7); 962 SAT_SH2_SH(tmp0, tmp1, 7); 963 out = PCKEV_XORI128_UB(tmp0, tmp1); 964 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 965 dst += (4 * dst_stride); 966 967 hz_out1 = hz_out5; 968 vec0 = vec2; 969 } 970 } 971 972 static void common_hv_4ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, 973 uint8_t *RESTRICT dst, int32_t dst_stride, 974 const int8_t *filter_horiz, 975 const int8_t *filter_vert, 976 int32_t height) { 977 uint32_t loop_cnt; 978 v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; 979 v16u8 mask0, mask1, out0, out1; 980 v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3; 981 v8i16 hz_out0, hz_out1, hz_out2, hz_out3; 982 v8i16 vec0, vec1, vec2, vec3, vec4; 983 984 mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]); 985 src -= (1 + 1 * src_stride); 986 987 filt = LD_SH(filter_horiz); 988 SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); 989 990 mask1 = mask0 + 2; 991 992 LD_SB3(src, src_stride, src0, src1, src2); 993 src += (3 * src_stride); 994 995 XORI_B3_128_SB(src0, src1, src2); 996 hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); 997 hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); 998 hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); 999 ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); 1000 1001 filt = LD_SH(filter_vert); 1002 SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); 1003 1004 for (loop_cnt = (height >> 2); loop_cnt--;) { 1005 LD_SB4(src, src_stride, src3, src4, src5, src6); 1006 src += (4 * src_stride); 1007 1008 XORI_B4_128_SB(src3, src4, src5, src6); 1009 hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); 1010 vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2); 1011 tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1012 1013 hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); 1014 vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3); 1015 tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); 1016 1017 hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); 1018 vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 1019 tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1); 1020 1021 hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); 1022 ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1); 1023 tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1024 1025 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1026 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1027 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 1028 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 1029 ST8x4_UB(out0, out1, dst, dst_stride); 1030 dst += (4 * dst_stride); 1031 1032 vec0 = vec4; 1033 vec2 = vec1; 1034 } 1035 } 1036 1037 static void common_hv_4ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, 1038 uint8_t *RESTRICT dst, int32_t dst_stride, 1039 const int8_t *filter_horiz, 1040 const int8_t *filter_vert, 1041 int32_t height) { 1042 int32_t multiple8_cnt; 1043 for (multiple8_cnt = 2; multiple8_cnt--;) { 1044 common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, 1045 filter_vert, height); 1046 src += 8; 1047 dst += 8; 1048 } 1049 } 1050 1051 static void common_hv_6ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, 1052 uint8_t *RESTRICT dst, int32_t dst_stride, 1053 const int8_t *filter_horiz, 1054 const int8_t *filter_vert, 1055 int32_t height) { 1056 uint32_t loop_cnt; 1057 v16i8 src0, src1, src2, src3, src4, src5, src6; 1058 v16i8 filt_hz0, filt_hz1, filt_hz2; 1059 v16u8 res0, res1, mask0, mask1, mask2; 1060 v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; 1061 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; 1062 1063 mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); 1064 src -= (2 + 1 * src_stride); 1065 1066 filt = LD_SH(filter_horiz); 1067 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); 1068 1069 mask1 = mask0 + 2; 1070 mask2 = mask0 + 4; 1071 1072 LD_SB3(src, src_stride, src0, src1, src2); 1073 src += (3 * src_stride); 1074 1075 XORI_B3_128_SB(src0, src1, src2); 1076 hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, 1077 filt_hz2); 1078 hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, 1079 filt_hz2); 1080 vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 1081 1082 filt = LD_SH(filter_vert); 1083 SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); 1084 1085 for (loop_cnt = (height >> 2); loop_cnt--;) { 1086 LD_SB4(src, src_stride, src3, src4, src5, src6); 1087 src += (4 * src_stride); 1088 1089 XORI_B4_128_SB(src3, src4, src5, src6); 1090 hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, 1091 filt_hz1, filt_hz2); 1092 hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8); 1093 vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2); 1094 tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1095 1096 hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, 1097 filt_hz1, filt_hz2); 1098 hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8); 1099 vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); 1100 tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); 1101 1102 SRARI_H2_SH(tmp0, tmp1, 7); 1103 SAT_SH2_SH(tmp0, tmp1, 7); 1104 PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); 1105 XORI_B2_128_UB(res0, res1); 1106 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); 1107 dst += (4 * dst_stride); 1108 1109 hz_out1 = hz_out5; 1110 vec0 = vec2; 1111 } 1112 } 1113 1114 static void common_hv_6ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, 1115 uint8_t *RESTRICT dst, int32_t dst_stride, 1116 const int8_t *filter_horiz, 1117 const int8_t *filter_vert, 1118 int32_t height) { 1119 uint32_t loop_cnt; 1120 v16i8 src0, src1, src2, src3, src4, src5, src6; 1121 v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; 1122 v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3; 1123 v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3; 1124 v16u8 out0, out1; 1125 1126 mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]); 1127 src -= (2 + src_stride); 1128 1129 filt = LD_SH(filter_horiz); 1130 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); 1131 1132 mask1 = mask0 + 2; 1133 mask2 = mask0 + 4; 1134 1135 LD_SB3(src, src_stride, src0, src1, src2); 1136 src += (3 * src_stride); 1137 1138 XORI_B3_128_SB(src0, src1, src2); 1139 hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, 1140 filt_hz2); 1141 hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, 1142 filt_hz2); 1143 hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, 1144 filt_hz2); 1145 ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); 1146 1147 filt = LD_SH(filter_vert); 1148 SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); 1149 1150 for (loop_cnt = (height >> 2); loop_cnt--;) { 1151 LD_SB4(src, src_stride, src3, src4, src5, src6); 1152 src += (4 * src_stride); 1153 1154 XORI_B4_128_SB(src3, src4, src5, src6); 1155 1156 hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, 1157 filt_hz1, filt_hz2); 1158 vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2); 1159 tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 1160 1161 hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, 1162 filt_hz1, filt_hz2); 1163 vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3); 1164 tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); 1165 1166 hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, 1167 filt_hz1, filt_hz2); 1168 vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 1169 tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1); 1170 1171 hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, 1172 filt_hz1, filt_hz2); 1173 ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2); 1174 tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); 1175 1176 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1177 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1178 out0 = PCKEV_XORI128_UB(tmp0, tmp1); 1179 out1 = PCKEV_XORI128_UB(tmp2, tmp3); 1180 ST8x4_UB(out0, out1, dst, dst_stride); 1181 dst += (4 * dst_stride); 1182 } 1183 } 1184 1185 static void common_hv_6ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, 1186 uint8_t *RESTRICT dst, int32_t dst_stride, 1187 const int8_t *filter_horiz, 1188 const int8_t *filter_vert, 1189 int32_t height) { 1190 int32_t multiple8_cnt; 1191 for (multiple8_cnt = 2; multiple8_cnt--;) { 1192 common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, 1193 filter_vert, height); 1194 src += 8; 1195 dst += 8; 1196 } 1197 } 1198 1199 static void common_hv_4ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, 1200 uint8_t *RESTRICT dst, int32_t dst_stride, 1201 const int8_t *filter_horiz, 1202 const int8_t *filter_vert, 1203 int32_t height) { 1204 uint32_t loop_cnt; 1205 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1206 v16i8 filt_hz0, filt_hz1, mask0, mask1; 1207 v16u8 out; 1208 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1209 v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3; 1210 v8i16 filt, filt_vt0, filt_vt1, filt_vt2; 1211 1212 mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]); 1213 1214 src -= (1 + 2 * src_stride); 1215 1216 filt = LD_SH(filter_horiz); 1217 SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); 1218 1219 mask1 = mask0 + 2; 1220 1221 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 1222 src += (5 * src_stride); 1223 1224 XORI_B5_128_SB(src0, src1, src2, src3, src4); 1225 hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); 1226 hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1); 1227 hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); 1228 hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); 1229 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 1230 1231 filt = LD_SH(filter_vert); 1232 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); 1233 1234 for (loop_cnt = (height >> 2); loop_cnt--;) { 1235 LD_SB4(src, src_stride, src5, src6, src7, src8); 1236 XORI_B4_128_SB(src5, src6, src7, src8); 1237 src += (4 * src_stride); 1238 1239 hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); 1240 hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8); 1241 out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); 1242 tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); 1243 1244 hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1); 1245 hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); 1246 out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); 1247 tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); 1248 1249 SRARI_H2_SH(tmp0, tmp1, 7); 1250 SAT_SH2_SH(tmp0, tmp1, 7); 1251 out = PCKEV_XORI128_UB(tmp0, tmp1); 1252 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 1253 dst += (4 * dst_stride); 1254 1255 hz_out3 = hz_out7; 1256 out0 = out2; 1257 out1 = out3; 1258 } 1259 } 1260 1261 static void common_hv_4ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, 1262 uint8_t *RESTRICT dst, int32_t dst_stride, 1263 const int8_t *filter_horiz, 1264 const int8_t *filter_vert, 1265 int32_t height) { 1266 uint32_t loop_cnt; 1267 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 1268 v16i8 filt_hz0, filt_hz1, mask0, mask1; 1269 v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3; 1270 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 1271 v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; 1272 v16u8 vec0, vec1; 1273 1274 mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]); 1275 src -= (1 + 2 * src_stride); 1276 1277 filt = LD_SH(filter_horiz); 1278 SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); 1279 1280 mask1 = mask0 + 2; 1281 1282 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 1283 src += (5 * src_stride); 1284 1285 XORI_B5_128_SB(src0, src1, src2, src3, src4); 1286 hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); 1287 hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); 1288 hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); 1289 hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); 1290 hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); 1291 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 1292 ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); 1293 1294 filt = LD_SH(filter_vert); 1295 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); 1296 1297 for (loop_cnt = (height >> 2); loop_cnt--;) { 1298 LD_SB4(src, src_stride, src5, src6, src7, src8); 1299 src += (4 * src_stride); 1300 1301 XORI_B4_128_SB(src5, src6, src7, src8); 1302 1303 hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); 1304 out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); 1305 tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); 1306 1307 hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); 1308 out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5); 1309 tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); 1310 1311 hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1); 1312 out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); 1313 tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); 1314 1315 hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1); 1316 out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); 1317 tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); 1318 1319 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1320 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 1321 vec0 = PCKEV_XORI128_UB(tmp0, tmp1); 1322 vec1 = PCKEV_XORI128_UB(tmp2, tmp3); 1323 ST8x4_UB(vec0, vec1, dst, dst_stride); 1324 dst += (4 * dst_stride); 1325 1326 hz_out4 = hz_out8; 1327 out0 = out2; 1328 out1 = out6; 1329 out3 = out5; 1330 out4 = out7; 1331 } 1332 } 1333 1334 static void common_hv_4ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, 1335 uint8_t *RESTRICT dst, int32_t dst_stride, 1336 const int8_t *filter_horiz, 1337 const int8_t *filter_vert, 1338 int32_t height) { 1339 int32_t multiple8_cnt; 1340 for (multiple8_cnt = 2; multiple8_cnt--;) { 1341 common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, 1342 filter_vert, height); 1343 src += 8; 1344 dst += 8; 1345 } 1346 } 1347 1348 void vp8_sixtap_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, 1349 int32_t xoffset, int32_t yoffset, 1350 uint8_t *RESTRICT dst, int32_t dst_stride) { 1351 const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1]; 1352 const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1]; 1353 1354 if (yoffset) { 1355 if (xoffset) { 1356 switch (xoffset) { 1357 case 2: 1358 case 4: 1359 case 6: 1360 switch (yoffset) { 1361 case 2: 1362 case 4: 1363 case 6: 1364 common_hv_6ht_6vt_4w_msa(src, src_stride, dst, dst_stride, 1365 h_filter, v_filter, 4); 1366 break; 1367 1368 case 1: 1369 case 3: 1370 case 5: 1371 case 7: 1372 common_hv_6ht_4vt_4w_msa(src, src_stride, dst, dst_stride, 1373 h_filter, v_filter + 1, 4); 1374 break; 1375 } 1376 break; 1377 1378 case 1: 1379 case 3: 1380 case 5: 1381 case 7: 1382 switch (yoffset) { 1383 case 2: 1384 case 4: 1385 case 6: 1386 common_hv_4ht_6vt_4w_msa(src, src_stride, dst, dst_stride, 1387 h_filter + 1, v_filter, 4); 1388 break; 1389 1390 case 1: 1391 case 3: 1392 case 5: 1393 case 7: 1394 common_hv_4ht_4vt_4w_msa(src, src_stride, dst, dst_stride, 1395 h_filter + 1, v_filter + 1, 4); 1396 break; 1397 } 1398 break; 1399 } 1400 } else { 1401 switch (yoffset) { 1402 case 2: 1403 case 4: 1404 case 6: 1405 common_vt_6t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4); 1406 break; 1407 1408 case 1: 1409 case 3: 1410 case 5: 1411 case 7: 1412 common_vt_4t_4w_msa(src, src_stride, dst, dst_stride, v_filter + 1, 1413 4); 1414 break; 1415 } 1416 } 1417 } else { 1418 switch (xoffset) { 1419 case 0: { 1420 uint32_t tp0, tp1, tp2, tp3; 1421 1422 LW4(src, src_stride, tp0, tp1, tp2, tp3); 1423 SW4(tp0, tp1, tp2, tp3, dst, dst_stride); 1424 break; 1425 } 1426 case 2: 1427 case 4: 1428 case 6: 1429 common_hz_6t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4); 1430 break; 1431 1432 case 1: 1433 case 3: 1434 case 5: 1435 case 7: 1436 common_hz_4t_4w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4); 1437 break; 1438 } 1439 } 1440 } 1441 1442 void vp8_sixtap_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride, 1443 int32_t xoffset, int32_t yoffset, 1444 uint8_t *RESTRICT dst, int32_t dst_stride) { 1445 const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1]; 1446 const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1]; 1447 1448 if (yoffset) { 1449 if (xoffset) { 1450 switch (xoffset) { 1451 case 2: 1452 case 4: 1453 case 6: 1454 switch (yoffset) { 1455 case 2: 1456 case 4: 1457 case 6: 1458 common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride, 1459 h_filter, v_filter, 4); 1460 break; 1461 1462 case 1: 1463 case 3: 1464 case 5: 1465 case 7: 1466 common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride, 1467 h_filter, v_filter + 1, 4); 1468 break; 1469 } 1470 break; 1471 1472 case 1: 1473 case 3: 1474 case 5: 1475 case 7: 1476 switch (yoffset) { 1477 case 2: 1478 case 4: 1479 case 6: 1480 common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride, 1481 h_filter + 1, v_filter, 4); 1482 break; 1483 1484 case 1: 1485 case 3: 1486 case 5: 1487 case 7: 1488 common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride, 1489 h_filter + 1, v_filter + 1, 4); 1490 break; 1491 } 1492 break; 1493 } 1494 } else { 1495 switch (yoffset) { 1496 case 2: 1497 case 4: 1498 case 6: 1499 common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4); 1500 break; 1501 1502 case 1: 1503 case 3: 1504 case 5: 1505 case 7: 1506 common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1, 1507 4); 1508 break; 1509 } 1510 } 1511 } else { 1512 switch (xoffset) { 1513 case 0: vp8_copy_mem8x4(src, src_stride, dst, dst_stride); break; 1514 case 2: 1515 case 4: 1516 case 6: 1517 common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4); 1518 break; 1519 1520 case 1: 1521 case 3: 1522 case 5: 1523 case 7: 1524 common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4); 1525 break; 1526 } 1527 } 1528 } 1529 1530 void vp8_sixtap_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride, 1531 int32_t xoffset, int32_t yoffset, 1532 uint8_t *RESTRICT dst, int32_t dst_stride) { 1533 const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1]; 1534 const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1]; 1535 1536 if (yoffset) { 1537 if (xoffset) { 1538 switch (xoffset) { 1539 case 2: 1540 case 4: 1541 case 6: 1542 switch (yoffset) { 1543 case 2: 1544 case 4: 1545 case 6: 1546 common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride, 1547 h_filter, v_filter, 8); 1548 break; 1549 1550 case 1: 1551 case 3: 1552 case 5: 1553 case 7: 1554 common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride, 1555 h_filter, v_filter + 1, 8); 1556 break; 1557 } 1558 break; 1559 1560 case 1: 1561 case 3: 1562 case 5: 1563 case 7: 1564 switch (yoffset) { 1565 case 2: 1566 case 4: 1567 case 6: 1568 common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride, 1569 h_filter + 1, v_filter, 8); 1570 break; 1571 1572 case 1: 1573 case 3: 1574 case 5: 1575 case 7: 1576 common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride, 1577 h_filter + 1, v_filter + 1, 8); 1578 break; 1579 } 1580 break; 1581 } 1582 } else { 1583 switch (yoffset) { 1584 case 2: 1585 case 4: 1586 case 6: 1587 common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8); 1588 break; 1589 1590 case 1: 1591 case 3: 1592 case 5: 1593 case 7: 1594 common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1, 1595 8); 1596 break; 1597 } 1598 } 1599 } else { 1600 switch (xoffset) { 1601 case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break; 1602 case 2: 1603 case 4: 1604 case 6: 1605 common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8); 1606 break; 1607 1608 case 1: 1609 case 3: 1610 case 5: 1611 case 7: 1612 common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 8); 1613 break; 1614 } 1615 } 1616 } 1617 1618 void vp8_sixtap_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride, 1619 int32_t xoffset, int32_t yoffset, 1620 uint8_t *RESTRICT dst, int32_t dst_stride) { 1621 const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1]; 1622 const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1]; 1623 1624 if (yoffset) { 1625 if (xoffset) { 1626 switch (xoffset) { 1627 case 2: 1628 case 4: 1629 case 6: 1630 switch (yoffset) { 1631 case 2: 1632 case 4: 1633 case 6: 1634 common_hv_6ht_6vt_16w_msa(src, src_stride, dst, dst_stride, 1635 h_filter, v_filter, 16); 1636 break; 1637 1638 case 1: 1639 case 3: 1640 case 5: 1641 case 7: 1642 common_hv_6ht_4vt_16w_msa(src, src_stride, dst, dst_stride, 1643 h_filter, v_filter + 1, 16); 1644 break; 1645 } 1646 break; 1647 1648 case 1: 1649 case 3: 1650 case 5: 1651 case 7: 1652 switch (yoffset) { 1653 case 2: 1654 case 4: 1655 case 6: 1656 common_hv_4ht_6vt_16w_msa(src, src_stride, dst, dst_stride, 1657 h_filter + 1, v_filter, 16); 1658 break; 1659 1660 case 1: 1661 case 3: 1662 case 5: 1663 case 7: 1664 common_hv_4ht_4vt_16w_msa(src, src_stride, dst, dst_stride, 1665 h_filter + 1, v_filter + 1, 16); 1666 break; 1667 } 1668 break; 1669 } 1670 } else { 1671 switch (yoffset) { 1672 case 2: 1673 case 4: 1674 case 6: 1675 common_vt_6t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16); 1676 break; 1677 1678 case 1: 1679 case 3: 1680 case 5: 1681 case 7: 1682 common_vt_4t_16w_msa(src, src_stride, dst, dst_stride, v_filter + 1, 1683 16); 1684 break; 1685 } 1686 } 1687 } else { 1688 switch (xoffset) { 1689 case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break; 1690 case 2: 1691 case 4: 1692 case 6: 1693 common_hz_6t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16); 1694 break; 1695 1696 case 1: 1697 case 3: 1698 case 5: 1699 case 7: 1700 common_hz_4t_16w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 1701 16); 1702 break; 1703 } 1704 } 1705 } 1706