/*
 * Copyright 2007 Luca Barbato
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Luca Barbato not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Luca Barbato makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Luca Barbato (lu_zero (at) gentoo.org)
 *
 * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
 */

#include <config.h>
#include "pixman-private.h"
#include "pixman-combine32.h"
#include <altivec.h>

#define AVV(x...) {x}

static force_inline vector unsigned int
splat_alpha (vector unsigned int pix)
{
    return vec_perm (pix, pix,
                     (vector unsigned char)AVV (
                         0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
                         0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
}

static force_inline vector unsigned int
pix_multiply (vector unsigned int p, vector unsigned int a)
{
    vector unsigned short hi, lo, mod;

    /* unpack to short */
    hi = (vector unsigned short)
        vec_mergeh ((vector unsigned char)AVV (0),
                    (vector unsigned char)p);

    mod = (vector unsigned short)
        vec_mergeh ((vector unsigned char)AVV (0),
                    (vector unsigned char)a);

    hi = vec_mladd (hi, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));

    hi = vec_sr (hi, vec_splat_u16 (8));

    /* unpack to short */
    lo = (vector unsigned short)
        vec_mergel ((vector unsigned char)AVV (0),
                    (vector unsigned char)p);
    mod = (vector unsigned short)
        vec_mergel ((vector unsigned char)AVV (0),
                    (vector unsigned char)a);

    lo = vec_mladd (lo, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));

    lo = vec_sr (lo, vec_splat_u16 (8));

    return (vector unsigned int)vec_packsu (hi, lo);
}

static force_inline vector unsigned int
pix_add (vector unsigned int a, vector unsigned int b)
{
    return (vector unsigned int)vec_adds ((vector unsigned char)a,
                                          (vector unsigned char)b);
}

static force_inline vector unsigned int
pix_add_mul (vector unsigned int x,
             vector unsigned int a,
             vector unsigned int y,
             vector unsigned int b)
{
    vector unsigned int t1, t2;

    t1 = pix_multiply (x, a);
    t2 = pix_multiply (y, b);

    return pix_add (t1, t2);
}

static force_inline vector unsigned int
negate (vector unsigned int src)
{
    return vec_nor (src, src);
}
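
/*
 * Added note: per-channel arithmetic performed by pix_multiply () above,
 * shown as an illustrative scalar sketch (the helper name is hypothetical,
 * not part of this file):
 *
 *     uint8_t mul_un8 (uint8_t p, uint8_t a)
 *     {
 *         uint16_t t = (uint16_t)p * a + 0x0080;
 *         return (uint8_t) ((t + (t >> 8)) >> 8);
 *     }
 *
 * Adding 0x80 and folding the high byte back in gives a correctly rounded
 * division by 255, so the vector code computes p * a / 255 for all sixteen
 * 8-bit channels of four ARGB pixels at once.
 */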

/* dest*~srca + src */
static force_inline vector unsigned int
over (vector unsigned int src,
      vector unsigned int srca,
      vector unsigned int dest)
{
    vector unsigned char tmp = (vector unsigned char)
        pix_multiply (dest, negate (srca));

    tmp = vec_adds ((vector unsigned char)src, tmp);
    return (vector unsigned int)tmp;
}

/* in == pix_multiply */
#define in_over(src, srca, mask, dest) \
    over (pix_multiply (src, mask),    \
          pix_multiply (srca, mask), dest)

#define COMPUTE_SHIFT_MASK(source) \
    source ## _mask = vec_lvsl (0, source);

#define COMPUTE_SHIFT_MASKS(dest, source)   \
    dest ## _mask = vec_lvsl (0, dest);     \
    source ## _mask = vec_lvsl (0, source); \
    store_mask = vec_lvsr (0, dest);

#define COMPUTE_SHIFT_MASKC(dest, source, mask) \
    mask ## _mask = vec_lvsl (0, mask);         \
    dest ## _mask = vec_lvsl (0, dest);         \
    source ## _mask = vec_lvsl (0, source);     \
    store_mask = vec_lvsr (0, dest);

/* notice you have to declare temp vars...
 * Note: tmp3 and tmp4 must remain untouched!
 */

#define LOAD_VECTORS(dest, source)              \
    tmp1 = (typeof(tmp1))vec_ld (0, source);    \
    tmp2 = (typeof(tmp2))vec_ld (15, source);   \
    tmp3 = (typeof(tmp3))vec_ld (0, dest);      \
    v ## source = (typeof(v ## source))         \
        vec_perm (tmp1, tmp2, source ## _mask); \
    tmp4 = (typeof(tmp4))vec_ld (15, dest);     \
    v ## dest = (typeof(v ## dest))             \
        vec_perm (tmp3, tmp4, dest ## _mask);

#define LOAD_VECTORSC(dest, source, mask)       \
    tmp1 = (typeof(tmp1))vec_ld (0, source);    \
    tmp2 = (typeof(tmp2))vec_ld (15, source);   \
    tmp3 = (typeof(tmp3))vec_ld (0, dest);      \
    v ## source = (typeof(v ## source))         \
        vec_perm (tmp1, tmp2, source ## _mask); \
    tmp4 = (typeof(tmp4))vec_ld (15, dest);     \
    tmp1 = (typeof(tmp1))vec_ld (0, mask);      \
    v ## dest = (typeof(v ## dest))             \
        vec_perm (tmp3, tmp4, dest ## _mask);   \
    tmp2 = (typeof(tmp2))vec_ld (15, mask);     \
    v ## mask = (typeof(v ## mask))             \
        vec_perm (tmp1, tmp2, mask ## _mask);

#define LOAD_VECTORSM(dest, source, mask)       \
    LOAD_VECTORSC (dest, source, mask)          \
    v ## source = pix_multiply (v ## source,    \
                                splat_alpha (v ## mask));

#define STORE_VECTOR(dest)                                                \
    edges = vec_perm (tmp4, tmp3, dest ## _mask);                         \
    tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
    tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \
    vec_st ((vector unsigned int) tmp3, 15, dest);                        \
    vec_st ((vector unsigned int) tmp1, 0, dest);
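
/*
 * Added note: the macros above implement AltiVec's standard unaligned-access
 * idiom.  vec_ld () ignores the low four address bits, so loading at offsets
 * 0 and 15 fetches the two aligned quadwords that straddle a possibly
 * unaligned pointer, and vec_perm () with the vec_lvsl () permute mask
 * reassembles the sixteen bytes that were actually requested.  On the way
 * out, STORE_VECTOR blends the computed pixels with the preserved edge bytes
 * (kept in tmp3/tmp4) through the vec_lvsr () mask, so the two aligned
 * stores do not clobber memory outside the destination span.
 */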

static void
vmx_combine_over_u_no_mask (uint32_t *dest,
                            const uint32_t *src,
                            int width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = over (vsrc, splat_alpha (vsrc), vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);

        dest[i] = d;
    }
}

static void
vmx_combine_over_u_mask (uint32_t *dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = over (vsrc, splat_alpha (vsrc), vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t ia;

        UN8x4_MUL_UN8 (s, m);

        ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
        dest[i] = d;
    }
}

static void
vmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t *dest,
                    const uint32_t *src,
                    const uint32_t *mask,
                    int width)
{
    if (mask)
        vmx_combine_over_u_mask (dest, src, mask, width);
    else
        vmx_combine_over_u_no_mask (dest, src, width);
}

static void
vmx_combine_over_reverse_u_no_mask (uint32_t *dest,
                                    const uint32_t *src,
                                    int width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = over (vdest, splat_alpha (vdest), vsrc);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t ia = ALPHA_8 (~dest[i]);

        UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
        dest[i] = s;
    }
}

static void
vmx_combine_over_reverse_u_mask (uint32_t *dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,
                                 int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = over (vdest, splat_alpha (vdest), vsrc);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t ia = ALPHA_8 (~dest[i]);

        UN8x4_MUL_UN8 (s, m);

        UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
        dest[i] = s;
    }
}

static void
vmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t op,
                            uint32_t *dest,
                            const uint32_t *src,
                            const uint32_t *mask,
                            int width)
{
    if (mask)
        vmx_combine_over_reverse_u_mask (dest, src, mask, width);
    else
        vmx_combine_over_reverse_u_no_mask (dest, src, width);
}

static void
vmx_combine_in_u_no_mask (uint32_t *dest,
                          const uint32_t *src,
                          int width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_multiply (vsrc, splat_alpha (vdest));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (dest[i]);

        UN8x4_MUL_UN8 (s, a);
        dest[i] = s;
    }
}

static void
vmx_combine_in_u_mask (uint32_t *dest,
                       const uint32_t *src,
                       const uint32_t *mask,
                       int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_multiply (vsrc, splat_alpha (vdest));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (dest[i]);

        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8 (s, a);

        dest[i] = s;
    }
}

static void
vmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t op,
                  uint32_t *dest,
                  const uint32_t *src,
                  const uint32_t *mask,
                  int width)
{
    if (mask)
        vmx_combine_in_u_mask (dest, src, mask, width);
    else
        vmx_combine_in_u_no_mask (dest, src, width);
}

static void
vmx_combine_in_reverse_u_no_mask (uint32_t *dest,
                                  const uint32_t *src,
                                  int width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_multiply (vdest, splat_alpha (vsrc));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t d = dest[i];
        uint32_t a = ALPHA_8 (src[i]);

        UN8x4_MUL_UN8 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_in_reverse_u_mask (uint32_t *dest,
                               const uint32_t *src,
                               const uint32_t *mask,
                               int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_multiply (vdest, splat_alpha (vsrc));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t d = dest[i];
        uint32_t a = src[i];

        UN8x4_MUL_UN8 (a, m);
        a = ALPHA_8 (a);
        UN8x4_MUL_UN8 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t op,
                          uint32_t *dest,
                          const uint32_t *src,
                          const uint32_t *mask,
                          int width)
{
    if (mask)
        vmx_combine_in_reverse_u_mask (dest, src, mask, width);
    else
        vmx_combine_in_reverse_u_no_mask (dest, src, width);
}

static void
vmx_combine_out_u_no_mask (uint32_t *dest,
                           const uint32_t *src,
                           int width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (~dest[i]);

        UN8x4_MUL_UN8 (s, a);

        dest[i] = s;
    }
}

static void
vmx_combine_out_u_mask (uint32_t *dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (~dest[i]);

        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8 (s, a);

        dest[i] = s;
    }
}

static void
vmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t op,
                   uint32_t *dest,
                   const uint32_t *src,
                   const uint32_t *mask,
                   int width)
{
    if (mask)
        vmx_combine_out_u_mask (dest, src, mask, width);
    else
        vmx_combine_out_u_no_mask (dest, src, width);
}

static void
vmx_combine_out_reverse_u_no_mask (uint32_t *dest,
                                   const uint32_t *src,
                                   int width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t d = dest[i];
        uint32_t a = ALPHA_8 (~src[i]);

        UN8x4_MUL_UN8 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_out_reverse_u_mask (uint32_t *dest,
                                const uint32_t *src,
                                const uint32_t *mask,
                                int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t d = dest[i];
        uint32_t a = src[i];

        UN8x4_MUL_UN8 (a, m);
        a = ALPHA_8 (~a);
        UN8x4_MUL_UN8 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t op,
                           uint32_t *dest,
                           const uint32_t *src,
                           const uint32_t *mask,
                           int width)
{
    if (mask)
        vmx_combine_out_reverse_u_mask (dest, src, mask, width);
    else
        vmx_combine_out_reverse_u_no_mask (dest, src, width);
}

static void
vmx_combine_atop_u_no_mask (uint32_t *dest,
                            const uint32_t *src,
                            int width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_add_mul (vsrc, splat_alpha (vdest),
                             vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t dest_a = ALPHA_8 (d);
        uint32_t src_ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

        dest[i] = s;
    }
}

static void
vmx_combine_atop_u_mask (uint32_t *dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_add_mul (vsrc, splat_alpha (vdest),
                             vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t dest_a = ALPHA_8 (d);
        uint32_t src_ia;

        UN8x4_MUL_UN8 (s, m);

        src_ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

        dest[i] = s;
    }
}

static void
vmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t *dest,
                    const uint32_t *src,
                    const uint32_t *mask,
                    int width)
{
    if (mask)
        vmx_combine_atop_u_mask (dest, src, mask, width);
    else
        vmx_combine_atop_u_no_mask (dest, src, width);
}

static void
vmx_combine_atop_reverse_u_no_mask (uint32_t *dest,
                                    const uint32_t *src,
                                    int width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_add_mul (vdest, splat_alpha (vsrc),
                             vsrc, splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t src_a = ALPHA_8 (s);
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

        dest[i] = s;
    }
}

static void
vmx_combine_atop_reverse_u_mask (uint32_t *dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,
                                 int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_add_mul (vdest, splat_alpha (vsrc),
                             vsrc, splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t src_a;
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8 (s, m);

        src_a = ALPHA_8 (s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

        dest[i] = s;
    }
}

static void
vmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t op,
                            uint32_t *dest,
                            const uint32_t *src,
                            const uint32_t *mask,
                            int width)
{
    if (mask)
        vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
    else
        vmx_combine_atop_reverse_u_no_mask (dest, src, width);
}

static void
vmx_combine_xor_u_no_mask (uint32_t *dest,
                           const uint32_t *src,
                           int width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
                             vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t src_ia = ALPHA_8 (~s);
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

        dest[i] = s;
    }
}

static void
vmx_combine_xor_u_mask (uint32_t *dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
                             vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t src_ia;
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8 (s, m);

        src_ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

        dest[i] = s;
    }
}

static void
vmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t op,
                   uint32_t *dest,
                   const uint32_t *src,
                   const uint32_t *mask,
                   int width)
{
    if (mask)
        vmx_combine_xor_u_mask (dest, src, mask, width);
    else
        vmx_combine_xor_u_no_mask (dest, src, width);
}

static void
vmx_combine_add_u_no_mask (uint32_t *dest,
                           const uint32_t *src,
                           int width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_add (vsrc, vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];

        UN8x4_ADD_UN8x4 (d, s);

        dest[i] = d;
    }
}

static void
vmx_combine_add_u_mask (uint32_t *dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_add (vsrc, vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];

        UN8x4_MUL_UN8 (s, m);
        UN8x4_ADD_UN8x4 (d, s);

        dest[i] = d;
    }
}

static void
vmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t op,
                   uint32_t *dest,
                   const uint32_t *src,
                   const uint32_t *mask,
                   int width)
{
    if (mask)
        vmx_combine_add_u_mask (dest, src, mask, width);
    else
        vmx_combine_add_u_no_mask (dest, src, width);
}
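
/*
 * Added note: the *_u combiners above treat the mask as a single (unified)
 * alpha value per pixel, which is why LOAD_VECTORSM pre-scales the source
 * by splat_alpha (vmask).  The *_ca combiners below are the component-alpha
 * variants: the mask is applied per channel, so LOAD_VECTORSC keeps the
 * full mask vector and the combiners multiply by vmask directly.
 */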

static void
vmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t *dest,
                    const uint32_t *src,
                    const uint32_t *mask,
                    int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_multiply (vsrc, vmask);

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];

        UN8x4_MUL_UN8x4 (s, a);

        dest[i] = s;
    }
}

static void
vmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t op,
                     uint32_t *dest,
                     const uint32_t *src,
                     const uint32_t *mask,
                     int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);

        dest[i] = d;
    }
}

static void
vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t op,
                             uint32_t *dest,
                             const uint32_t *src,
                             const uint32_t *mask,
                             int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t ida = ALPHA_8 (~d);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);

        dest[i] = s;
    }
}

static void
vmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t op,
                   uint32_t *dest,
                   const uint32_t *src,
                   const uint32_t *mask,
                   int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t da = ALPHA_8 (dest[i]);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (s, da);

        dest[i] = s;
    }
}

static void
vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t op,
                           uint32_t *dest,
                           const uint32_t *src,
                           const uint32_t *mask,
                           int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (src[i]);

        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t *dest,
                    const uint32_t *src,
                    const uint32_t *mask,
                    int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_multiply (
            pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t da = ALPHA_8 (~d);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (s, da);

        dest[i] = s;
    }
}

static void
vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t op,
                            uint32_t *dest,
                            const uint32_t *src,
                            const uint32_t *mask,
                            int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_multiply (
            vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);

        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4 (d, ~a);

        dest[i] = d;
    }
}

static void
vmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t op,
                     uint32_t *dest,
                     const uint32_t *src,
                     const uint32_t *mask,
                     int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask, vsrca;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vsrca = splat_alpha (vsrc);

        vsrc = pix_multiply (vsrc, vmask);
        vmask = pix_multiply (vmask, vsrca);

        vdest = pix_add_mul (vsrc, splat_alpha (vdest),
                             negate (vmask), vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);
        uint32_t da = ALPHA_8 (d);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);

        dest[i] = d;
    }
}

static void
vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t op,
                             uint32_t *dest,
                             const uint32_t *src,
                             const uint32_t *mask,
                             int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_add_mul (vdest,
                             pix_multiply (vmask, splat_alpha (vsrc)),
                             pix_multiply (vsrc, vmask),
                             negate (splat_alpha (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);
        uint32_t da = ALPHA_8 (~d);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);

        dest[i] = d;
    }
}

static void
vmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t *dest,
                    const uint32_t *src,
                    const uint32_t *mask,
                    int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_add_mul (vdest,
                             negate (pix_multiply (vmask, splat_alpha (vsrc))),
                             pix_multiply (vsrc, vmask),
                             negate (splat_alpha (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);
        uint32_t da = ALPHA_8 (~d);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);

        dest[i] = d;
    }
}

static void
vmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t *dest,
                    const uint32_t *src,
                    const uint32_t *mask,
                    int width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_add (pix_multiply (vsrc, vmask), vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_ADD_UN8x4 (s, d);

        dest[i] = s;
    }
}
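
/*
 * Added note: the fast-path table below is intentionally empty (only the
 * PIXMAN_OP_NONE terminator), so whole-operation compositing falls through
 * to the fallback implementation; this file only overrides the 32-bit
 * combiners registered in _pixman_implementation_create_vmx () below.
 */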

static const pixman_fast_path_t vmx_fast_paths[] =
{
    { PIXMAN_OP_NONE },
};

pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);

    /* Set up function pointers */

    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;

    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;

    return imp;
}