/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

/* Insert the four 32-bit values in0..in3 into word lanes 0..3 of 'out'. */
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

/* 4-wide blocks: four 4-byte rows are gathered into one 16-byte vector
 * before the absolute-difference accumulation. */
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

/* 8-wide blocks: two 8-byte rows are packed into each 16-byte vector. */
static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

/* 16-wide blocks: one row per vector, four rows per loop iteration. */
static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}
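
/* For 32- and 64-pixel-wide blocks a row no longer fits in one MSA register:
 * each row is covered by two (or four) 16-byte loads at byte offsets
 * 0/16(/32/48), and the partial SADs are accumulated per vector pair. */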

static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}
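
/* The x3/x8 variants below return SADs for three (or eight) consecutive
 * horizontal offsets of the same reference position. Rather than reloading
 * the reference for each offset, the loaded vectors are shifted one byte at
 * a time with SLDI. */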

static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);

    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref, ref0, ref1, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, diff;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref0, ref1, ref;
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}
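
/* The x4d variants compute four SADs per call, one against each of four
 * independent reference blocks, so callers (e.g., the encoder's motion
 * search) can score four candidate predictors in a single pass over the
 * source block. */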

static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_UW_U32(sad);
}
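
/* The avgsad_* kernels implement the vpx_sadWxH_avg() semantics: the
 * reference is first averaged with a second predictor and the SAD is then
 * taken between the source and that averaged prediction. */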

static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

/* Wrappers binding the generic W-width kernels above to the fixed-size
 * vpx_sad* entry points dispatched via vpx_dsp_rtcd.h. */
#define VPX_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);
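
#ifdef SAD_MSA_SCALAR_CHECK
/* Minimal scalar sketch of the SAD that every kernel above computes, kept
 * only as an illustrative cross-check. The SAD_MSA_SCALAR_CHECK guard is
 * hypothetical (nothing in the build defines it), so this is normally
 * compiled out. */
static uint32_t sad_scalar_ref(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t width, int32_t height) {
  uint32_t sad = 0;
  int32_t row, col;

  for (row = 0; row < height; ++row) {
    for (col = 0; col < width; ++col) {
      const int32_t diff = (int32_t)src[col] - (int32_t)ref[col];
      sad += (uint32_t)(diff < 0 ? -diff : diff);
    }
    src += src_stride;
    ref += ref_stride;
  }

  return sad;
}
#endif /* SAD_MSA_SCALAR_CHECK */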