1 2 /* filter_msa_intrinsics.c - MSA optimised filter functions 3 * 4 * Copyright (c) 2016 Glenn Randers-Pehrson 5 * Written by Mandar Sahastrabuddhe, August 2016. 6 * Last changed in libpng 1.6.25 [September 1, 2016] 7 * 8 * This code is released under the libpng license. 9 * For conditions of distribution and use, see the disclaimer 10 * and license in png.h 11 */ 12 #include <stdio.h> 13 #include <stdint.h> 14 #include "../pngpriv.h" 15 16 #ifdef PNG_READ_SUPPORTED 17 18 /* This code requires -mfpu=msa on the command line: */ 19 #if PNG_MIPS_MSA_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ 20 21 #include <msa.h> 22 23 /* libpng row pointers are not necessarily aligned to any particular boundary, 24 * however this code will only work with appropriate alignment. mips/mips_init.c 25 * checks for this (and will not compile unless it is done). This code uses 26 * variants of png_aligncast to avoid compiler warnings. 27 */ 28 #define png_ptr(type,pointer) png_aligncast(type *,pointer) 29 #define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer) 30 31 /* The following relies on a variable 'temp_pointer' being declared with type 32 * 'type'. This is written this way just to hide the GCC strict aliasing 33 * warning; note that the code is safe because there never is an alias between 34 * the input and output pointers. 35 */ 36 #define png_ldr(type,pointer)\ 37 (temp_pointer = png_ptr(type,pointer), *temp_pointer) 38 39 #if PNG_MIPS_MSA_OPT > 0 40 41 #ifdef CLANG_BUILD 42 #define MSA_SRLI_B(a, b) __msa_srli_b((v16i8) a, b) 43 44 #define LW(psrc) \ 45 ( { \ 46 uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ 47 uint32_t val_m; \ 48 \ 49 asm volatile ( \ 50 "lw %[val_m], %[psrc_lw_m] \n\t" \ 51 \ 52 : [val_m] "=r" (val_m) \ 53 : [psrc_lw_m] "m" (*psrc_lw_m) \ 54 ); \ 55 \ 56 val_m; \ 57 } ) 58 59 #define SH(val, pdst) \ 60 { \ 61 uint8_t *pdst_sh_m = (uint8_t *) (pdst); \ 62 uint16_t val_m = (val); \ 63 \ 64 asm volatile ( \ 65 "sh %[val_m], %[pdst_sh_m] \n\t" \ 66 \ 67 : [pdst_sh_m] "=m" (*pdst_sh_m) \ 68 : [val_m] "r" (val_m) \ 69 ); \ 70 } 71 72 #define SW(val, pdst) \ 73 { \ 74 uint8_t *pdst_sw_m = (uint8_t *) (pdst); \ 75 uint32_t val_m = (val); \ 76 \ 77 asm volatile ( \ 78 "sw %[val_m], %[pdst_sw_m] \n\t" \ 79 \ 80 : [pdst_sw_m] "=m" (*pdst_sw_m) \ 81 : [val_m] "r" (val_m) \ 82 ); \ 83 } 84 85 #if (__mips == 64) 86 #define SD(val, pdst) \ 87 { \ 88 uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ 89 uint64_t val_m = (val); \ 90 \ 91 asm volatile ( \ 92 "sd %[val_m], %[pdst_sd_m] \n\t" \ 93 \ 94 : [pdst_sd_m] "=m" (*pdst_sd_m) \ 95 : [val_m] "r" (val_m) \ 96 ); \ 97 } 98 #else 99 #define SD(val, pdst) \ 100 { \ 101 uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ 102 uint32_t val0_m, val1_m; \ 103 \ 104 val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ 105 val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ 106 \ 107 SW(val0_m, pdst_sd_m); \ 108 SW(val1_m, pdst_sd_m + 4); \ 109 } 110 #endif 111 #else 112 #define MSA_SRLI_B(a, b) (a >> b) 113 114 #if (__mips_isa_rev >= 6) 115 #define LW(psrc) \ 116 ( { \ 117 uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ 118 uint32_t val_m; \ 119 \ 120 asm volatile ( \ 121 "lw %[val_m], %[psrc_lw_m] \n\t" \ 122 \ 123 : [val_m] "=r" (val_m) \ 124 : [psrc_lw_m] "m" (*psrc_lw_m) \ 125 ); \ 126 \ 127 val_m; \ 128 } ) 129 130 #define SH(val, pdst) \ 131 { \ 132 uint8_t *pdst_sh_m = (uint8_t *) (pdst); \ 133 uint16_t val_m = (val); \ 134 \ 135 asm volatile ( \ 136 "sh %[val_m], %[pdst_sh_m] \n\t" \ 137 \ 138 : [pdst_sh_m] "=m" (*pdst_sh_m) \ 139 : [val_m] "r" (val_m) \ 140 ); \ 141 } 142 143 #define SW(val, pdst) \ 144 { \ 145 uint8_t *pdst_sw_m = (uint8_t *) (pdst); \ 146 uint32_t val_m = (val); \ 147 \ 148 asm volatile ( \ 149 "sw %[val_m], %[pdst_sw_m] \n\t" \ 150 \ 151 : [pdst_sw_m] "=m" (*pdst_sw_m) \ 152 : [val_m] "r" (val_m) \ 153 ); \ 154 } 155 156 #if (__mips == 64) 157 #define SD(val, pdst) \ 158 { \ 159 uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ 160 uint64_t val_m = (val); \ 161 \ 162 asm volatile ( \ 163 "sd %[val_m], %[pdst_sd_m] \n\t" \ 164 \ 165 : [pdst_sd_m] "=m" (*pdst_sd_m) \ 166 : [val_m] "r" (val_m) \ 167 ); \ 168 } 169 #else 170 #define SD(val, pdst) \ 171 { \ 172 uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ 173 uint32_t val0_m, val1_m; \ 174 \ 175 val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ 176 val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ 177 \ 178 SW(val0_m, pdst_sd_m); \ 179 SW(val1_m, pdst_sd_m + 4); \ 180 } 181 #endif 182 #else // !(__mips_isa_rev >= 6) 183 #define LW(psrc) \ 184 ( { \ 185 uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ 186 uint32_t val_m; \ 187 \ 188 asm volatile ( \ 189 "ulw %[val_m], %[psrc_lw_m] \n\t" \ 190 \ 191 : [val_m] "=r" (val_m) \ 192 : [psrc_lw_m] "m" (*psrc_lw_m) \ 193 ); \ 194 \ 195 val_m; \ 196 } ) 197 198 #define SH(val, pdst) \ 199 { \ 200 uint8_t *pdst_sh_m = (uint8_t *) (pdst); \ 201 uint16_t val_m = (val); \ 202 \ 203 asm volatile ( \ 204 "ush %[val_m], %[pdst_sh_m] \n\t" \ 205 \ 206 : [pdst_sh_m] "=m" (*pdst_sh_m) \ 207 : [val_m] "r" (val_m) \ 208 ); \ 209 } 210 211 #define SW(val, pdst) \ 212 { \ 213 uint8_t *pdst_sw_m = (uint8_t *) (pdst); \ 214 uint32_t val_m = (val); \ 215 \ 216 asm volatile ( \ 217 "usw %[val_m], %[pdst_sw_m] \n\t" \ 218 \ 219 : [pdst_sw_m] "=m" (*pdst_sw_m) \ 220 : [val_m] "r" (val_m) \ 221 ); \ 222 } 223 224 #define SD(val, pdst) \ 225 { \ 226 uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ 227 uint32_t val0_m, val1_m; \ 228 \ 229 val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ 230 val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ 231 \ 232 SW(val0_m, pdst_sd_m); \ 233 SW(val1_m, pdst_sd_m + 4); \ 234 } 235 236 #define SW_ZERO(pdst) \ 237 { \ 238 uint8_t *pdst_m = (uint8_t *) (pdst); \ 239 \ 240 asm volatile ( \ 241 "usw $0, %[pdst_m] \n\t" \ 242 \ 243 : [pdst_m] "=m" (*pdst_m) \ 244 : \ 245 ); \ 246 } 247 #endif // (__mips_isa_rev >= 6) 248 #endif 249 250 #define LD_B(RTYPE, psrc) *((RTYPE *) (psrc)) 251 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__) 252 #define LD_B2(RTYPE, psrc, stride, out0, out1) \ 253 { \ 254 out0 = LD_B(RTYPE, (psrc)); \ 255 out1 = LD_B(RTYPE, (psrc) + stride); \ 256 } 257 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) 258 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 259 { \ 260 LD_B2(RTYPE, (psrc), stride, out0, out1); \ 261 LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ 262 } 263 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) 264 265 #define ST_B(RTYPE, in, pdst) *((RTYPE *) (pdst)) = (in) 266 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) 267 #define ST_B2(RTYPE, in0, in1, pdst, stride) \ 268 { \ 269 ST_B(RTYPE, in0, (pdst)); \ 270 ST_B(RTYPE, in1, (pdst) + stride); \ 271 } 272 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) 273 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ 274 { \ 275 ST_B2(RTYPE, in0, in1, (pdst), stride); \ 276 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 277 } 278 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) 279 280 #define ADD2(in0, in1, in2, in3, out0, out1) \ 281 { \ 282 out0 = in0 + in1; \ 283 out1 = in2 + in3; \ 284 } 285 #define ADD3(in0, in1, in2, in3, in4, in5, \ 286 out0, out1, out2) \ 287 { \ 288 ADD2(in0, in1, in2, in3, out0, out1); \ 289 out2 = in4 + in5; \ 290 } 291 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ 292 out0, out1, out2, out3) \ 293 { \ 294 ADD2(in0, in1, in2, in3, out0, out1); \ 295 ADD2(in4, in5, in6, in7, out2, out3); \ 296 } 297 298 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 299 { \ 300 out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \ 301 out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \ 302 } 303 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) 304 305 #define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ 306 { \ 307 out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \ 308 out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \ 309 } 310 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) 311 312 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ 313 { \ 314 v16i8 zero_m = { 0 }; \ 315 out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val); \ 316 out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \ 317 } 318 #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__) 319 320 #define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val) \ 321 { \ 322 v16i8 zero_m = { 0 }; \ 323 SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ 324 out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val); \ 325 } 326 #define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__) 327 328 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 329 { \ 330 out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \ 331 out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \ 332 } 333 #define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__) 334 335 #define ADD_ABS_H3(RTYPE, in0, in1, in2, out0, out1, out2) \ 336 { \ 337 RTYPE zero = {0}; \ 338 \ 339 out0 = __msa_add_a_h((v8i16) zero, in0); \ 340 out1 = __msa_add_a_h((v8i16) zero, in1); \ 341 out2 = __msa_add_a_h((v8i16) zero, in2); \ 342 } 343 #define ADD_ABS_H3_SH(...) ADD_ABS_H3(v8i16, __VA_ARGS__) 344 345 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 346 { \ 347 out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \ 348 out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \ 349 } 350 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) 351 352 #define CMP_AND_SELECT(inp0, inp1, inp2, inp3, inp4, inp5, out0) \ 353 { \ 354 v8i16 _sel_h0, _sel_h1; \ 355 v16u8 _sel_b0, _sel_b1; \ 356 _sel_h0 = (v8i16) __msa_clt_u_h((v8u16) inp1, (v8u16) inp0); \ 357 _sel_b0 = (v16u8) __msa_pckev_b((v16i8) _sel_h0, (v16i8) _sel_h0); \ 358 inp0 = (v8i16) __msa_bmnz_v((v16u8) inp0, (v16u8) inp1, (v16u8) _sel_h0); \ 359 inp4 = (v16u8) __msa_bmnz_v(inp3, inp4, _sel_b0); \ 360 _sel_h1 = (v8i16) __msa_clt_u_h((v8u16) inp2, (v8u16) inp0); \ 361 _sel_b1 = (v16u8) __msa_pckev_b((v16i8) _sel_h1, (v16i8) _sel_h1); \ 362 inp4 = (v16u8) __msa_bmnz_v(inp4, inp5, _sel_b1); \ 363 out0 += inp4; \ 364 } 365 366 void png_read_filter_row_up_msa(png_row_infop row_info, png_bytep row, 367 png_const_bytep prev_row) 368 { 369 png_size_t i, cnt, cnt16, cnt32; 370 png_size_t istop = row_info->rowbytes; 371 png_bytep rp = row; 372 png_const_bytep pp = prev_row; 373 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 374 375 for (i = 0; i < (istop >> 6); i++) 376 { 377 LD_UB4(rp, 16, src0, src1, src2, src3); 378 LD_UB4(pp, 16, src4, src5, src6, src7); 379 pp += 64; 380 381 ADD4(src0, src4, src1, src5, src2, src6, src3, src7, 382 src0, src1, src2, src3); 383 384 ST_UB4(src0, src1, src2, src3, rp, 16); 385 rp += 64; 386 } 387 388 if (istop & 0x3F) 389 { 390 cnt32 = istop & 0x20; 391 cnt16 = istop & 0x10; 392 cnt = istop & 0xF; 393 394 if(cnt32) 395 { 396 if (cnt16 && cnt) 397 { 398 LD_UB4(rp, 16, src0, src1, src2, src3); 399 LD_UB4(pp, 16, src4, src5, src6, src7); 400 401 ADD4(src0, src4, src1, src5, src2, src6, src3, src7, 402 src0, src1, src2, src3); 403 404 ST_UB4(src0, src1, src2, src3, rp, 16); 405 rp += 64; 406 } 407 else if (cnt16 || cnt) 408 { 409 LD_UB2(rp, 16, src0, src1); 410 LD_UB2(pp, 16, src4, src5); 411 pp += 32; 412 src2 = LD_UB(rp + 32); 413 src6 = LD_UB(pp); 414 415 ADD3(src0, src4, src1, src5, src2, src6, src0, src1, src2); 416 417 ST_UB2(src0, src1, rp, 16); 418 rp += 32; 419 ST_UB(src2, rp); 420 rp += 16; 421 } 422 else 423 { 424 LD_UB2(rp, 16, src0, src1); 425 LD_UB2(pp, 16, src4, src5); 426 427 ADD2(src0, src4, src1, src5, src0, src1); 428 429 ST_UB2(src0, src1, rp, 16); 430 rp += 32; 431 } 432 } 433 else if (cnt16 && cnt) 434 { 435 LD_UB2(rp, 16, src0, src1); 436 LD_UB2(pp, 16, src4, src5); 437 438 ADD2(src0, src4, src1, src5, src0, src1); 439 440 ST_UB2(src0, src1, rp, 16); 441 rp += 32; 442 } 443 else if (cnt16 || cnt) 444 { 445 src0 = LD_UB(rp); 446 src4 = LD_UB(pp); 447 pp += 16; 448 449 src0 += src4; 450 451 ST_UB(src0, rp); 452 rp += 16; 453 } 454 } 455 } 456 457 void png_read_filter_row_sub4_msa(png_row_infop row_info, png_bytep row, 458 png_const_bytep prev_row) 459 { 460 png_size_t count; 461 png_size_t istop = row_info->rowbytes; 462 png_bytep src = row; 463 png_bytep nxt = row + 4; 464 int32_t inp0; 465 v16u8 src0, src1, src2, src3, src4; 466 v16u8 dst0, dst1; 467 v16u8 zero = { 0 }; 468 469 istop -= 4; 470 471 inp0 = LW(src); 472 src += 4; 473 src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); 474 475 for (count = 0; count < istop; count += 16) 476 { 477 src1 = LD_UB(src); 478 src += 16; 479 480 src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 4); 481 src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 8); 482 src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 12); 483 src1 += src0; 484 src2 += src1; 485 src3 += src2; 486 src4 += src3; 487 src0 = src4; 488 ILVEV_W2_UB(src1, src2, src3, src4, dst0, dst1); 489 dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0); 490 491 ST_UB(dst0, nxt); 492 nxt += 16; 493 } 494 } 495 496 void png_read_filter_row_sub3_msa(png_row_infop row_info, png_bytep row, 497 png_const_bytep prev_row) 498 { 499 png_size_t count; 500 png_size_t istop = row_info->rowbytes; 501 png_bytep src = row; 502 png_bytep nxt = row + 3; 503 int64_t out0; 504 int32_t inp0, out1; 505 v16u8 src0, src1, src2, src3, src4, dst0, dst1; 506 v16u8 zero = { 0 }; 507 v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 508 v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 }; 509 510 istop -= 3; 511 512 inp0 = LW(src); 513 src += 3; 514 src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); 515 516 for (count = 0; count < istop; count += 12) 517 { 518 src1 = LD_UB(src); 519 src += 12; 520 521 src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 3); 522 src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 6); 523 src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 9); 524 src1 += src0; 525 src2 += src1; 526 src3 += src2; 527 src4 += src3; 528 src0 = src4; 529 VSHF_B2_UB(src1, src2, src3, src4, mask0, mask0, dst0, dst1); 530 dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0); 531 out0 = __msa_copy_s_d((v2i64) dst0, 0); 532 out1 = __msa_copy_s_w((v4i32) dst0, 2); 533 534 SD(out0, nxt); 535 nxt += 8; 536 SW(out1, nxt); 537 nxt += 4; 538 } 539 } 540 541 void png_read_filter_row_avg4_msa(png_row_infop row_info, png_bytep row, 542 png_const_bytep prev_row) 543 { 544 png_size_t i; 545 png_bytep src = row; 546 png_bytep nxt = row; 547 png_const_bytep pp = prev_row; 548 png_size_t istop = row_info->rowbytes - 4; 549 int32_t inp0, inp1, out0; 550 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1; 551 v16u8 zero = { 0 }; 552 553 inp0 = LW(pp); 554 pp += 4; 555 inp1 = LW(src); 556 src += 4; 557 src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); 558 src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1); 559 src0 = (v16u8) MSA_SRLI_B(src0, 1); 560 src1 += src0; 561 out0 = __msa_copy_s_w((v4i32) src1, 0); 562 SW(out0, nxt); 563 nxt += 4; 564 565 for (i = 0; i < istop; i += 16) 566 { 567 src2 = LD_UB(pp); 568 pp += 16; 569 src6 = LD_UB(src); 570 src += 16; 571 572 SLDI_B2_0_UB(src2, src6, src3, src7, 4); 573 SLDI_B2_0_UB(src2, src6, src4, src8, 8); 574 SLDI_B2_0_UB(src2, src6, src5, src9, 12); 575 src2 = __msa_ave_u_b(src2, src1); 576 src6 += src2; 577 src3 = __msa_ave_u_b(src3, src6); 578 src7 += src3; 579 src4 = __msa_ave_u_b(src4, src7); 580 src8 += src4; 581 src5 = __msa_ave_u_b(src5, src8); 582 src9 += src5; 583 src1 = src9; 584 ILVEV_W2_UB(src6, src7, src8, src9, dst0, dst1); 585 dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0); 586 587 ST_UB(dst0, nxt); 588 nxt += 16; 589 } 590 } 591 592 void png_read_filter_row_avg3_msa(png_row_infop row_info, png_bytep row, 593 png_const_bytep prev_row) 594 { 595 png_size_t i; 596 png_bytep src = row; 597 png_bytep nxt = row; 598 png_const_bytep pp = prev_row; 599 png_size_t istop = row_info->rowbytes - 3; 600 int64_t out0; 601 int32_t inp0, inp1, out1; 602 int16_t out2; 603 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1; 604 v16u8 zero = { 0 }; 605 v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 606 v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 }; 607 608 inp0 = LW(pp); 609 pp += 3; 610 inp1 = LW(src); 611 src += 3; 612 src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); 613 src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1); 614 src0 = (v16u8) MSA_SRLI_B(src0, 1); 615 src1 += src0; 616 out2 = __msa_copy_s_h((v8i16) src1, 0); 617 SH(out2, nxt); 618 nxt += 2; 619 nxt[0] = src1[2]; 620 nxt++; 621 622 for (i = 0; i < istop; i += 12) 623 { 624 src2 = LD_UB(pp); 625 pp += 12; 626 src6 = LD_UB(src); 627 src += 12; 628 629 SLDI_B2_0_UB(src2, src6, src3, src7, 3); 630 SLDI_B2_0_UB(src2, src6, src4, src8, 6); 631 SLDI_B2_0_UB(src2, src6, src5, src9, 9); 632 src2 = __msa_ave_u_b(src2, src1); 633 src6 += src2; 634 src3 = __msa_ave_u_b(src3, src6); 635 src7 += src3; 636 src4 = __msa_ave_u_b(src4, src7); 637 src8 += src4; 638 src5 = __msa_ave_u_b(src5, src8); 639 src9 += src5; 640 src1 = src9; 641 VSHF_B2_UB(src6, src7, src8, src9, mask0, mask0, dst0, dst1); 642 dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0); 643 out0 = __msa_copy_s_d((v2i64) dst0, 0); 644 out1 = __msa_copy_s_w((v4i32) dst0, 2); 645 646 SD(out0, nxt); 647 nxt += 8; 648 SW(out1, nxt); 649 nxt += 4; 650 } 651 } 652 653 void png_read_filter_row_paeth4_msa(png_row_infop row_info, 654 png_bytep row, 655 png_const_bytep prev_row) 656 { 657 int32_t count, rp_end; 658 png_bytep nxt; 659 png_const_bytep prev_nxt; 660 int32_t inp0, inp1, res0; 661 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; 662 v16u8 src10, src11, src12, src13, dst0, dst1; 663 v8i16 vec0, vec1, vec2; 664 v16u8 zero = { 0 }; 665 666 nxt = row; 667 prev_nxt = prev_row; 668 669 inp0 = LW(nxt); 670 inp1 = LW(prev_nxt); 671 prev_nxt += 4; 672 src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); 673 src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1); 674 675 src1 += src0; 676 res0 = __msa_copy_s_w((v4i32) src1, 0); 677 678 SW(res0, nxt); 679 nxt += 4; 680 681 /* Remainder */ 682 rp_end = row_info->rowbytes - 4; 683 684 for (count = 0; count < rp_end; count += 16) 685 { 686 src2 = LD_UB(prev_nxt); 687 prev_nxt += 16; 688 src6 = LD_UB(prev_row); 689 prev_row += 16; 690 src10 = LD_UB(nxt); 691 692 SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 4); 693 SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 8); 694 SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 12); 695 ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1); 696 HSUB_UB2_SH(vec0, vec1, vec0, vec1); 697 vec2 = vec0 + vec1; 698 ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); 699 CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10); 700 ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1); 701 HSUB_UB2_SH(vec0, vec1, vec0, vec1); 702 vec2 = vec0 + vec1; 703 ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); 704 CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11); 705 ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1); 706 HSUB_UB2_SH(vec0, vec1, vec0, vec1); 707 vec2 = vec0 + vec1; 708 ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); 709 CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12); 710 ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1); 711 HSUB_UB2_SH(vec0, vec1, vec0, vec1); 712 vec2 = vec0 + vec1; 713 ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); 714 CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13); 715 src1 = src13; 716 ILVEV_W2_UB(src10, src11, src12, src1, dst0, dst1); 717 dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0); 718 719 ST_UB(dst0, nxt); 720 nxt += 16; 721 } 722 } 723 724 void png_read_filter_row_paeth3_msa(png_row_infop row_info, 725 png_bytep row, 726 png_const_bytep prev_row) 727 { 728 int32_t count, rp_end; 729 png_bytep nxt; 730 png_const_bytep prev_nxt; 731 int64_t out0; 732 int32_t inp0, inp1, out1; 733 int16_t out2; 734 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1; 735 v16u8 src10, src11, src12, src13; 736 v8i16 vec0, vec1, vec2; 737 v16u8 zero = { 0 }; 738 v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 739 v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 }; 740 741 nxt = row; 742 prev_nxt = prev_row; 743 744 inp0 = LW(nxt); 745 inp1 = LW(prev_nxt); 746 prev_nxt += 3; 747 src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0); 748 src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1); 749 750 src1 += src0; 751 out2 = __msa_copy_s_h((v8i16) src1, 0); 752 753 SH(out2, nxt); 754 nxt += 2; 755 nxt[0] = src1[2]; 756 nxt++; 757 758 /* Remainder */ 759 rp_end = row_info->rowbytes - 3; 760 761 for (count = 0; count < rp_end; count += 12) 762 { 763 src2 = LD_UB(prev_nxt); 764 prev_nxt += 12; 765 src6 = LD_UB(prev_row); 766 prev_row += 12; 767 src10 = LD_UB(nxt); 768 769 SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 3); 770 SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 6); 771 SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 9); 772 ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1); 773 HSUB_UB2_SH(vec0, vec1, vec0, vec1); 774 vec2 = vec0 + vec1; 775 ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); 776 CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10); 777 ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1); 778 HSUB_UB2_SH(vec0, vec1, vec0, vec1); 779 vec2 = vec0 + vec1; 780 ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); 781 CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11); 782 ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1); 783 HSUB_UB2_SH(vec0, vec1, vec0, vec1); 784 vec2 = vec0 + vec1; 785 ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); 786 CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12); 787 ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1); 788 HSUB_UB2_SH(vec0, vec1, vec0, vec1); 789 vec2 = vec0 + vec1; 790 ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2); 791 CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13); 792 src1 = src13; 793 VSHF_B2_UB(src10, src11, src12, src13, mask0, mask0, dst0, dst1); 794 dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0); 795 out0 = __msa_copy_s_d((v2i64) dst0, 0); 796 out1 = __msa_copy_s_w((v4i32) dst0, 2); 797 798 SD(out0, nxt); 799 nxt += 8; 800 SW(out1, nxt); 801 nxt += 4; 802 } 803 } 804 805 #endif /* PNG_MIPS_MSA_OPT > 0 */ 806 #endif /* PNG_MIPS_MSA_IMPLEMENTATION == 1 (intrinsics) */ 807 #endif /* READ */ 808