1 /* 2 * Copyright 2008 Rodrigo Kumpera 3 * Copyright 2008 André Tupinambá 4 * 5 * Permission to use, copy, modify, distribute, and sell this software and its 6 * documentation for any purpose is hereby granted without fee, provided that 7 * the above copyright notice appear in all copies and that both that 8 * copyright notice and this permission notice appear in supporting 9 * documentation, and that the name of Red Hat not be used in advertising or 10 * publicity pertaining to distribution of the software without specific, 11 * written prior permission. Red Hat makes no representations about the 12 * suitability of this software for any purpose. It is provided "as is" 13 * without express or implied warranty. 14 * 15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS 16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN 20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 22 * SOFTWARE. 23 * 24 * Author: Rodrigo Kumpera (kumpera (at) gmail.com) 25 * André Tupinambá (andrelrt (at) gmail.com) 26 * 27 * Based on work by Owen Taylor and Søren Sandmann 28 */ 29 #ifdef HAVE_CONFIG_H 30 #include <config.h> 31 #endif 32 33 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ 34 #include <emmintrin.h> /* for SSE2 intrinsics */ 35 #include "pixman-private.h" 36 #include "pixman-combine32.h" 37 #include "pixman-inlines.h" 38 39 static __m128i mask_0080; 40 static __m128i mask_00ff; 41 static __m128i mask_0101; 42 static __m128i mask_ffff; 43 static __m128i mask_ff000000; 44 static __m128i mask_alpha; 45 46 static __m128i mask_565_r; 47 static __m128i mask_565_g1, mask_565_g2; 48 static __m128i mask_565_b; 49 static __m128i mask_red; 50 static __m128i mask_green; 51 static __m128i mask_blue; 52 53 static __m128i mask_565_fix_rb; 54 static __m128i mask_565_fix_g; 55 56 static __m128i mask_565_rb; 57 static __m128i mask_565_pack_multiplier; 58 59 static force_inline __m128i 60 unpack_32_1x128 (uint32_t data) 61 { 62 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ()); 63 } 64 65 static force_inline void 66 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi) 67 { 68 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); 69 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); 70 } 71 72 static force_inline __m128i 73 unpack_565_to_8888 (__m128i lo) 74 { 75 __m128i r, g, b, rb, t; 76 77 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red); 78 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green); 79 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue); 80 81 rb = _mm_or_si128 (r, b); 82 t = _mm_and_si128 (rb, mask_565_fix_rb); 83 t = _mm_srli_epi32 (t, 5); 84 rb = _mm_or_si128 (rb, t); 85 86 t = _mm_and_si128 (g, mask_565_fix_g); 87 t = _mm_srli_epi32 (t, 6); 88 g = _mm_or_si128 (g, t); 89 90 return _mm_or_si128 (rb, g); 91 } 92 93 static force_inline void 94 unpack_565_128_4x128 (__m128i data, 95 __m128i* data0, 96 __m128i* data1, 97 __m128i* data2, 98 __m128i* data3) 99 { 100 __m128i lo, hi; 101 102 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ()); 103 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ()); 104 105 lo = unpack_565_to_8888 (lo); 106 hi = unpack_565_to_8888 (hi); 107 108 unpack_128_2x128 (lo, data0, data1);
109 unpack_128_2x128 (hi, data2, data3); 110 } 111 112 static force_inline uint16_t 113 pack_565_32_16 (uint32_t pixel) 114 { 115 return (uint16_t) (((pixel >> 8) & 0xf800) | 116 ((pixel >> 5) & 0x07e0) | 117 ((pixel >> 3) & 0x001f)); 118 } 119 120 static force_inline __m128i 121 pack_2x128_128 (__m128i lo, __m128i hi) 122 { 123 return _mm_packus_epi16 (lo, hi); 124 } 125 126 static force_inline __m128i 127 pack_565_2packedx128_128 (__m128i lo, __m128i hi) 128 { 129 __m128i rb0 = _mm_and_si128 (lo, mask_565_rb); 130 __m128i rb1 = _mm_and_si128 (hi, mask_565_rb); 131 132 __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier); 133 __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier); 134 135 __m128i g0 = _mm_and_si128 (lo, mask_green); 136 __m128i g1 = _mm_and_si128 (hi, mask_green); 137 138 t0 = _mm_or_si128 (t0, g0); 139 t1 = _mm_or_si128 (t1, g1); 140 141 /* Simulates _mm_packus_epi32 */ 142 t0 = _mm_slli_epi32 (t0, 16 - 5); 143 t1 = _mm_slli_epi32 (t1, 16 - 5); 144 t0 = _mm_srai_epi32 (t0, 16); 145 t1 = _mm_srai_epi32 (t1, 16); 146 return _mm_packs_epi32 (t0, t1); 147 } 148 149 static force_inline __m128i 150 pack_565_2x128_128 (__m128i lo, __m128i hi) 151 { 152 __m128i data; 153 __m128i r, g1, g2, b; 154 155 data = pack_2x128_128 (lo, hi); 156 157 r = _mm_and_si128 (data, mask_565_r); 158 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1); 159 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2); 160 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b); 161 162 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b); 163 } 164 165 static force_inline __m128i 166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) 167 { 168 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), 169 pack_565_2x128_128 (*xmm2, *xmm3)); 170 } 171 172 static force_inline int 173 is_opaque (__m128i x) 174 { 175 __m128i ffs = _mm_cmpeq_epi8 (x, x); 176 177 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888; 178 } 179 180 static force_inline int 181 is_zero (__m128i x) 182 { 183 return _mm_movemask_epi8 ( 184 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff; 185 } 186 187 static force_inline int 188 is_transparent (__m128i x) 189 { 190 return (_mm_movemask_epi8 ( 191 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888; 192 } 193 194 static force_inline __m128i 195 expand_pixel_32_1x128 (uint32_t data) 196 { 197 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0)); 198 } 199 200 static force_inline __m128i 201 expand_alpha_1x128 (__m128i data) 202 { 203 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, 204 _MM_SHUFFLE (3, 3, 3, 3)), 205 _MM_SHUFFLE (3, 3, 3, 3)); 206 } 207 208 static force_inline void 209 expand_alpha_2x128 (__m128i data_lo, 210 __m128i data_hi, 211 __m128i* alpha_lo, 212 __m128i* alpha_hi) 213 { 214 __m128i lo, hi; 215 216 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)); 217 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)); 218 219 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3)); 220 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3)); 221 } 222 223 static force_inline void 224 expand_alpha_rev_2x128 (__m128i data_lo, 225 __m128i data_hi, 226 __m128i* alpha_lo, 227 __m128i* alpha_hi) 228 { 229 __m128i lo, hi; 230 231 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)); 232 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)); 233 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0)); 234 
*alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0)); 235 } 236 237 static force_inline void 238 pix_multiply_2x128 (__m128i* data_lo, 239 __m128i* data_hi, 240 __m128i* alpha_lo, 241 __m128i* alpha_hi, 242 __m128i* ret_lo, 243 __m128i* ret_hi) 244 { 245 __m128i lo, hi; 246 247 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo); 248 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi); 249 lo = _mm_adds_epu16 (lo, mask_0080); 250 hi = _mm_adds_epu16 (hi, mask_0080); 251 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); 252 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); 253 } 254 255 static force_inline void 256 pix_add_multiply_2x128 (__m128i* src_lo, 257 __m128i* src_hi, 258 __m128i* alpha_dst_lo, 259 __m128i* alpha_dst_hi, 260 __m128i* dst_lo, 261 __m128i* dst_hi, 262 __m128i* alpha_src_lo, 263 __m128i* alpha_src_hi, 264 __m128i* ret_lo, 265 __m128i* ret_hi) 266 { 267 __m128i t1_lo, t1_hi; 268 __m128i t2_lo, t2_hi; 269 270 pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); 271 pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); 272 273 *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo); 274 *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); 275 } 276 277 static force_inline void 278 negate_2x128 (__m128i data_lo, 279 __m128i data_hi, 280 __m128i* neg_lo, 281 __m128i* neg_hi) 282 { 283 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff); 284 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff); 285 } 286 287 static force_inline void 288 invert_colors_2x128 (__m128i data_lo, 289 __m128i data_hi, 290 __m128i* inv_lo, 291 __m128i* inv_hi) 292 { 293 __m128i lo, hi; 294 295 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)); 296 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)); 297 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2)); 298 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2)); 299 } 300 301 static force_inline void 302 over_2x128 (__m128i* src_lo, 303 __m128i* src_hi, 304 __m128i* alpha_lo, 305 __m128i* alpha_hi, 306 __m128i* dst_lo, 307 __m128i* dst_hi) 308 { 309 __m128i t1, t2; 310 311 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); 312 313 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); 314 315 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo); 316 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi); 317 } 318 319 static force_inline void 320 over_rev_non_pre_2x128 (__m128i src_lo, 321 __m128i src_hi, 322 __m128i* dst_lo, 323 __m128i* dst_hi) 324 { 325 __m128i lo, hi; 326 __m128i alpha_lo, alpha_hi; 327 328 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi); 329 330 lo = _mm_or_si128 (alpha_lo, mask_alpha); 331 hi = _mm_or_si128 (alpha_hi, mask_alpha); 332 333 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi); 334 335 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi); 336 337 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi); 338 } 339 340 static force_inline void 341 in_over_2x128 (__m128i* src_lo, 342 __m128i* src_hi, 343 __m128i* alpha_lo, 344 __m128i* alpha_hi, 345 __m128i* mask_lo, 346 __m128i* mask_hi, 347 __m128i* dst_lo, 348 __m128i* dst_hi) 349 { 350 __m128i s_lo, s_hi; 351 __m128i a_lo, a_hi; 352 353 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi); 354 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); 355 356 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); 357 } 358 359 /* load 4 pixels from a 16-byte boundary aligned address */ 360 static force_inline __m128i 361 load_128_aligned (__m128i* src) 362 { 363 return _mm_load_si128 (src); 364 } 365 366 
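/* Two conventions used throughout this file.
 *
 * Loads and stores: the combiners align the destination pointer with a
 * scalar head loop, process 4 pixels per iteration in the main loop and
 * finish with a scalar tail, so destination accesses can use the aligned
 * helpers while source and mask data are loaded unaligned.
 *
 * Arithmetic: assuming mask_0080 holds 0x0080 and mask_0101 holds 0x0101
 * in every 16-bit lane (as their names suggest), the sequence used by the
 * pix_multiply_* helpers,
 *
 *     t = x * a + 0x80;   result = (t * 0x0101) >> 16;
 *
 * computes the correctly rounded (x * a) / 255 for 8-bit x and a,
 * e.g. 0xff * 0xff -> 0xff and 0x80 * 0xff -> 0x80.
 */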
/* load 4 pixels from a unaligned address */ 367 static force_inline __m128i 368 load_128_unaligned (const __m128i* src) 369 { 370 return _mm_loadu_si128 (src); 371 } 372 373 /* save 4 pixels using Write Combining memory on a 16-byte 374 * boundary aligned address 375 */ 376 static force_inline void 377 save_128_write_combining (__m128i* dst, 378 __m128i data) 379 { 380 _mm_stream_si128 (dst, data); 381 } 382 383 /* save 4 pixels on a 16-byte boundary aligned address */ 384 static force_inline void 385 save_128_aligned (__m128i* dst, 386 __m128i data) 387 { 388 _mm_store_si128 (dst, data); 389 } 390 391 /* save 4 pixels on a unaligned address */ 392 static force_inline void 393 save_128_unaligned (__m128i* dst, 394 __m128i data) 395 { 396 _mm_storeu_si128 (dst, data); 397 } 398 399 static force_inline __m128i 400 load_32_1x128 (uint32_t data) 401 { 402 return _mm_cvtsi32_si128 (data); 403 } 404 405 static force_inline __m128i 406 expand_alpha_rev_1x128 (__m128i data) 407 { 408 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); 409 } 410 411 static force_inline __m128i 412 expand_pixel_8_1x128 (uint8_t data) 413 { 414 return _mm_shufflelo_epi16 ( 415 unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); 416 } 417 418 static force_inline __m128i 419 pix_multiply_1x128 (__m128i data, 420 __m128i alpha) 421 { 422 return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha), 423 mask_0080), 424 mask_0101); 425 } 426 427 static force_inline __m128i 428 pix_add_multiply_1x128 (__m128i* src, 429 __m128i* alpha_dst, 430 __m128i* dst, 431 __m128i* alpha_src) 432 { 433 __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst); 434 __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src); 435 436 return _mm_adds_epu8 (t1, t2); 437 } 438 439 static force_inline __m128i 440 negate_1x128 (__m128i data) 441 { 442 return _mm_xor_si128 (data, mask_00ff); 443 } 444 445 static force_inline __m128i 446 invert_colors_1x128 (__m128i data) 447 { 448 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); 449 } 450 451 static force_inline __m128i 452 over_1x128 (__m128i src, __m128i alpha, __m128i dst) 453 { 454 return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha))); 455 } 456 457 static force_inline __m128i 458 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst) 459 { 460 return over_1x128 (pix_multiply_1x128 (*src, *mask), 461 pix_multiply_1x128 (*alpha, *mask), 462 *dst); 463 } 464 465 static force_inline __m128i 466 over_rev_non_pre_1x128 (__m128i src, __m128i dst) 467 { 468 __m128i alpha = expand_alpha_1x128 (src); 469 470 return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src), 471 _mm_or_si128 (alpha, mask_alpha)), 472 alpha, 473 dst); 474 } 475 476 static force_inline uint32_t 477 pack_1x128_32 (__m128i data) 478 { 479 return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ())); 480 } 481 482 static force_inline __m128i 483 expand565_16_1x128 (uint16_t pixel) 484 { 485 __m128i m = _mm_cvtsi32_si128 (pixel); 486 487 m = unpack_565_to_8888 (m); 488 489 return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ()); 490 } 491 492 static force_inline uint32_t 493 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) 494 { 495 uint8_t a; 496 __m128i xmms; 497 498 a = src >> 24; 499 500 if (a == 0xff) 501 { 502 return src; 503 } 504 else if (src) 505 { 506 xmms = unpack_32_1x128 (src); 507 return pack_1x128_32 ( 508 over_1x128 (xmms, expand_alpha_1x128 (xmms), 509 unpack_32_1x128 (dst))); 510 } 511 512 return dst; 513 } 514 515 static 
force_inline uint32_t 516 combine1 (const uint32_t *ps, const uint32_t *pm) 517 { 518 uint32_t s = *ps; 519 520 if (pm) 521 { 522 __m128i ms, mm; 523 524 mm = unpack_32_1x128 (*pm); 525 mm = expand_alpha_1x128 (mm); 526 527 ms = unpack_32_1x128 (s); 528 ms = pix_multiply_1x128 (ms, mm); 529 530 s = pack_1x128_32 (ms); 531 } 532 533 return s; 534 } 535 536 static force_inline __m128i 537 combine4 (const __m128i *ps, const __m128i *pm) 538 { 539 __m128i xmm_src_lo, xmm_src_hi; 540 __m128i xmm_msk_lo, xmm_msk_hi; 541 __m128i s; 542 543 if (pm) 544 { 545 xmm_msk_lo = load_128_unaligned (pm); 546 547 if (is_transparent (xmm_msk_lo)) 548 return _mm_setzero_si128 (); 549 } 550 551 s = load_128_unaligned (ps); 552 553 if (pm) 554 { 555 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi); 556 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi); 557 558 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi); 559 560 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 561 &xmm_msk_lo, &xmm_msk_hi, 562 &xmm_src_lo, &xmm_src_hi); 563 564 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi); 565 } 566 567 return s; 568 } 569 570 static force_inline void 571 core_combine_over_u_sse2_mask (uint32_t * pd, 572 const uint32_t* ps, 573 const uint32_t* pm, 574 int w) 575 { 576 uint32_t s, d; 577 578 /* Align dst on a 16-byte boundary */ 579 while (w && ((uintptr_t)pd & 15)) 580 { 581 d = *pd; 582 s = combine1 (ps, pm); 583 584 if (s) 585 *pd = core_combine_over_u_pixel_sse2 (s, d); 586 pd++; 587 ps++; 588 pm++; 589 w--; 590 } 591 592 while (w >= 4) 593 { 594 __m128i mask = load_128_unaligned ((__m128i *)pm); 595 596 if (!is_zero (mask)) 597 { 598 __m128i src; 599 __m128i src_hi, src_lo; 600 __m128i mask_hi, mask_lo; 601 __m128i alpha_hi, alpha_lo; 602 603 src = load_128_unaligned ((__m128i *)ps); 604 605 if (is_opaque (_mm_and_si128 (src, mask))) 606 { 607 save_128_aligned ((__m128i *)pd, src); 608 } 609 else 610 { 611 __m128i dst = load_128_aligned ((__m128i *)pd); 612 __m128i dst_hi, dst_lo; 613 614 unpack_128_2x128 (mask, &mask_lo, &mask_hi); 615 unpack_128_2x128 (src, &src_lo, &src_hi); 616 617 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi); 618 pix_multiply_2x128 (&src_lo, &src_hi, 619 &mask_lo, &mask_hi, 620 &src_lo, &src_hi); 621 622 unpack_128_2x128 (dst, &dst_lo, &dst_hi); 623 624 expand_alpha_2x128 (src_lo, src_hi, 625 &alpha_lo, &alpha_hi); 626 627 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, 628 &dst_lo, &dst_hi); 629 630 save_128_aligned ( 631 (__m128i *)pd, 632 pack_2x128_128 (dst_lo, dst_hi)); 633 } 634 } 635 636 pm += 4; 637 ps += 4; 638 pd += 4; 639 w -= 4; 640 } 641 while (w) 642 { 643 d = *pd; 644 s = combine1 (ps, pm); 645 646 if (s) 647 *pd = core_combine_over_u_pixel_sse2 (s, d); 648 pd++; 649 ps++; 650 pm++; 651 652 w--; 653 } 654 } 655 656 static force_inline void 657 core_combine_over_u_sse2_no_mask (uint32_t * pd, 658 const uint32_t* ps, 659 int w) 660 { 661 uint32_t s, d; 662 663 /* Align dst on a 16-byte boundary */ 664 while (w && ((uintptr_t)pd & 15)) 665 { 666 d = *pd; 667 s = *ps; 668 669 if (s) 670 *pd = core_combine_over_u_pixel_sse2 (s, d); 671 pd++; 672 ps++; 673 w--; 674 } 675 676 while (w >= 4) 677 { 678 __m128i src; 679 __m128i src_hi, src_lo, dst_hi, dst_lo; 680 __m128i alpha_hi, alpha_lo; 681 682 src = load_128_unaligned ((__m128i *)ps); 683 684 if (!is_zero (src)) 685 { 686 if (is_opaque (src)) 687 { 688 save_128_aligned ((__m128i *)pd, src); 689 } 690 else 691 { 692 __m128i dst = load_128_aligned ((__m128i *)pd); 693 694 unpack_128_2x128 (src, 
&src_lo, &src_hi); 695 unpack_128_2x128 (dst, &dst_lo, &dst_hi); 696 697 expand_alpha_2x128 (src_lo, src_hi, 698 &alpha_lo, &alpha_hi); 699 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, 700 &dst_lo, &dst_hi); 701 702 save_128_aligned ( 703 (__m128i *)pd, 704 pack_2x128_128 (dst_lo, dst_hi)); 705 } 706 } 707 708 ps += 4; 709 pd += 4; 710 w -= 4; 711 } 712 while (w) 713 { 714 d = *pd; 715 s = *ps; 716 717 if (s) 718 *pd = core_combine_over_u_pixel_sse2 (s, d); 719 pd++; 720 ps++; 721 722 w--; 723 } 724 } 725 726 static force_inline void 727 sse2_combine_over_u (pixman_implementation_t *imp, 728 pixman_op_t op, 729 uint32_t * pd, 730 const uint32_t * ps, 731 const uint32_t * pm, 732 int w) 733 { 734 if (pm) 735 core_combine_over_u_sse2_mask (pd, ps, pm, w); 736 else 737 core_combine_over_u_sse2_no_mask (pd, ps, w); 738 } 739 740 static void 741 sse2_combine_over_reverse_u (pixman_implementation_t *imp, 742 pixman_op_t op, 743 uint32_t * pd, 744 const uint32_t * ps, 745 const uint32_t * pm, 746 int w) 747 { 748 uint32_t s, d; 749 750 __m128i xmm_dst_lo, xmm_dst_hi; 751 __m128i xmm_src_lo, xmm_src_hi; 752 __m128i xmm_alpha_lo, xmm_alpha_hi; 753 754 /* Align dst on a 16-byte boundary */ 755 while (w && 756 ((uintptr_t)pd & 15)) 757 { 758 d = *pd; 759 s = combine1 (ps, pm); 760 761 *pd++ = core_combine_over_u_pixel_sse2 (d, s); 762 w--; 763 ps++; 764 if (pm) 765 pm++; 766 } 767 768 while (w >= 4) 769 { 770 /* I'm loading unaligned because I'm not sure 771 * about the address alignment. 772 */ 773 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); 774 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 775 776 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 777 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 778 779 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 780 &xmm_alpha_lo, &xmm_alpha_hi); 781 782 over_2x128 (&xmm_dst_lo, &xmm_dst_hi, 783 &xmm_alpha_lo, &xmm_alpha_hi, 784 &xmm_src_lo, &xmm_src_hi); 785 786 /* rebuild the 4 pixel data and save */ 787 save_128_aligned ((__m128i*)pd, 788 pack_2x128_128 (xmm_src_lo, xmm_src_hi)); 789 790 w -= 4; 791 ps += 4; 792 pd += 4; 793 794 if (pm) 795 pm += 4; 796 } 797 798 while (w) 799 { 800 d = *pd; 801 s = combine1 (ps, pm); 802 803 *pd++ = core_combine_over_u_pixel_sse2 (d, s); 804 ps++; 805 w--; 806 if (pm) 807 pm++; 808 } 809 } 810 811 static force_inline uint32_t 812 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst) 813 { 814 uint32_t maska = src >> 24; 815 816 if (maska == 0) 817 { 818 return 0; 819 } 820 else if (maska != 0xff) 821 { 822 return pack_1x128_32 ( 823 pix_multiply_1x128 (unpack_32_1x128 (dst), 824 expand_alpha_1x128 (unpack_32_1x128 (src)))); 825 } 826 827 return dst; 828 } 829 830 static void 831 sse2_combine_in_u (pixman_implementation_t *imp, 832 pixman_op_t op, 833 uint32_t * pd, 834 const uint32_t * ps, 835 const uint32_t * pm, 836 int w) 837 { 838 uint32_t s, d; 839 840 __m128i xmm_src_lo, xmm_src_hi; 841 __m128i xmm_dst_lo, xmm_dst_hi; 842 843 while (w && ((uintptr_t)pd & 15)) 844 { 845 s = combine1 (ps, pm); 846 d = *pd; 847 848 *pd++ = core_combine_in_u_pixel_sse2 (d, s); 849 w--; 850 ps++; 851 if (pm) 852 pm++; 853 } 854 855 while (w >= 4) 856 { 857 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 858 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); 859 860 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 861 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 862 863 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 864 pix_multiply_2x128 (&xmm_src_lo,
&xmm_src_hi, 865 &xmm_dst_lo, &xmm_dst_hi, 866 &xmm_dst_lo, &xmm_dst_hi); 867 868 save_128_aligned ((__m128i*)pd, 869 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 870 871 ps += 4; 872 pd += 4; 873 w -= 4; 874 if (pm) 875 pm += 4; 876 } 877 878 while (w) 879 { 880 s = combine1 (ps, pm); 881 d = *pd; 882 883 *pd++ = core_combine_in_u_pixel_sse2 (d, s); 884 w--; 885 ps++; 886 if (pm) 887 pm++; 888 } 889 } 890 891 static void 892 sse2_combine_in_reverse_u (pixman_implementation_t *imp, 893 pixman_op_t op, 894 uint32_t * pd, 895 const uint32_t * ps, 896 const uint32_t * pm, 897 int w) 898 { 899 uint32_t s, d; 900 901 __m128i xmm_src_lo, xmm_src_hi; 902 __m128i xmm_dst_lo, xmm_dst_hi; 903 904 while (w && ((uintptr_t)pd & 15)) 905 { 906 s = combine1 (ps, pm); 907 d = *pd; 908 909 *pd++ = core_combine_in_u_pixel_sse2 (s, d); 910 ps++; 911 w--; 912 if (pm) 913 pm++; 914 } 915 916 while (w >= 4) 917 { 918 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 919 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); 920 921 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 922 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 923 924 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 925 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 926 &xmm_src_lo, &xmm_src_hi, 927 &xmm_dst_lo, &xmm_dst_hi); 928 929 save_128_aligned ( 930 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 931 932 ps += 4; 933 pd += 4; 934 w -= 4; 935 if (pm) 936 pm += 4; 937 } 938 939 while (w) 940 { 941 s = combine1 (ps, pm); 942 d = *pd; 943 944 *pd++ = core_combine_in_u_pixel_sse2 (s, d); 945 w--; 946 ps++; 947 if (pm) 948 pm++; 949 } 950 } 951 952 static void 953 sse2_combine_out_reverse_u (pixman_implementation_t *imp, 954 pixman_op_t op, 955 uint32_t * pd, 956 const uint32_t * ps, 957 const uint32_t * pm, 958 int w) 959 { 960 while (w && ((uintptr_t)pd & 15)) 961 { 962 uint32_t s = combine1 (ps, pm); 963 uint32_t d = *pd; 964 965 *pd++ = pack_1x128_32 ( 966 pix_multiply_1x128 ( 967 unpack_32_1x128 (d), negate_1x128 ( 968 expand_alpha_1x128 (unpack_32_1x128 (s))))); 969 970 if (pm) 971 pm++; 972 ps++; 973 w--; 974 } 975 976 while (w >= 4) 977 { 978 __m128i xmm_src_lo, xmm_src_hi; 979 __m128i xmm_dst_lo, xmm_dst_hi; 980 981 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); 982 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 983 984 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 985 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 986 987 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 988 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 989 990 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 991 &xmm_src_lo, &xmm_src_hi, 992 &xmm_dst_lo, &xmm_dst_hi); 993 994 save_128_aligned ( 995 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 996 997 ps += 4; 998 pd += 4; 999 if (pm) 1000 pm += 4; 1001 1002 w -= 4; 1003 } 1004 1005 while (w) 1006 { 1007 uint32_t s = combine1 (ps, pm); 1008 uint32_t d = *pd; 1009 1010 *pd++ = pack_1x128_32 ( 1011 pix_multiply_1x128 ( 1012 unpack_32_1x128 (d), negate_1x128 ( 1013 expand_alpha_1x128 (unpack_32_1x128 (s))))); 1014 ps++; 1015 if (pm) 1016 pm++; 1017 w--; 1018 } 1019 } 1020 1021 static void 1022 sse2_combine_out_u (pixman_implementation_t *imp, 1023 pixman_op_t op, 1024 uint32_t * pd, 1025 const uint32_t * ps, 1026 const uint32_t * pm, 1027 int w) 1028 { 1029 while (w && ((uintptr_t)pd & 15)) 1030 { 1031 uint32_t s = combine1 (ps, pm); 1032 uint32_t d = *pd; 1033 1034 *pd++ = pack_1x128_32 ( 1035 
pix_multiply_1x128 ( 1036 unpack_32_1x128 (s), negate_1x128 ( 1037 expand_alpha_1x128 (unpack_32_1x128 (d))))); 1038 w--; 1039 ps++; 1040 if (pm) 1041 pm++; 1042 } 1043 1044 while (w >= 4) 1045 { 1046 __m128i xmm_src_lo, xmm_src_hi; 1047 __m128i xmm_dst_lo, xmm_dst_hi; 1048 1049 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); 1050 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1051 1052 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1053 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1054 1055 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1056 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1057 1058 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1059 &xmm_dst_lo, &xmm_dst_hi, 1060 &xmm_dst_lo, &xmm_dst_hi); 1061 1062 save_128_aligned ( 1063 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1064 1065 ps += 4; 1066 pd += 4; 1067 w -= 4; 1068 if (pm) 1069 pm += 4; 1070 } 1071 1072 while (w) 1073 { 1074 uint32_t s = combine1 (ps, pm); 1075 uint32_t d = *pd; 1076 1077 *pd++ = pack_1x128_32 ( 1078 pix_multiply_1x128 ( 1079 unpack_32_1x128 (s), negate_1x128 ( 1080 expand_alpha_1x128 (unpack_32_1x128 (d))))); 1081 w--; 1082 ps++; 1083 if (pm) 1084 pm++; 1085 } 1086 } 1087 1088 static force_inline uint32_t 1089 core_combine_atop_u_pixel_sse2 (uint32_t src, 1090 uint32_t dst) 1091 { 1092 __m128i s = unpack_32_1x128 (src); 1093 __m128i d = unpack_32_1x128 (dst); 1094 1095 __m128i sa = negate_1x128 (expand_alpha_1x128 (s)); 1096 __m128i da = expand_alpha_1x128 (d); 1097 1098 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); 1099 } 1100 1101 static void 1102 sse2_combine_atop_u (pixman_implementation_t *imp, 1103 pixman_op_t op, 1104 uint32_t * pd, 1105 const uint32_t * ps, 1106 const uint32_t * pm, 1107 int w) 1108 { 1109 uint32_t s, d; 1110 1111 __m128i xmm_src_lo, xmm_src_hi; 1112 __m128i xmm_dst_lo, xmm_dst_hi; 1113 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 1114 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 1115 1116 while (w && ((uintptr_t)pd & 15)) 1117 { 1118 s = combine1 (ps, pm); 1119 d = *pd; 1120 1121 *pd++ = core_combine_atop_u_pixel_sse2 (s, d); 1122 w--; 1123 ps++; 1124 if (pm) 1125 pm++; 1126 } 1127 1128 while (w >= 4) 1129 { 1130 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); 1131 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1132 1133 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1134 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1135 1136 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1137 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1138 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1139 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1140 1141 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, 1142 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1143 1144 pix_add_multiply_2x128 ( 1145 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 1146 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, 1147 &xmm_dst_lo, &xmm_dst_hi); 1148 1149 save_128_aligned ( 1150 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1151 1152 ps += 4; 1153 pd += 4; 1154 w -= 4; 1155 if (pm) 1156 pm += 4; 1157 } 1158 1159 while (w) 1160 { 1161 s = combine1 (ps, pm); 1162 d = *pd; 1163 1164 *pd++ = core_combine_atop_u_pixel_sse2 (s, d); 1165 w--; 1166 ps++; 1167 if (pm) 1168 pm++; 1169 } 1170 } 1171 1172 static force_inline uint32_t 1173 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, 1174 uint32_t dst) 1175 { 1176 __m128i s = unpack_32_1x128 (src); 1177 __m128i d = unpack_32_1x128 (dst); 
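/* ATOP_REVERSE: dest = src * (1 - dst.alpha) + dst * src.alpha,
 * evaluated per channel by pix_add_multiply_1x128 below.
 */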
1178 1179 __m128i sa = expand_alpha_1x128 (s); 1180 __m128i da = negate_1x128 (expand_alpha_1x128 (d)); 1181 1182 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); 1183 } 1184 1185 static void 1186 sse2_combine_atop_reverse_u (pixman_implementation_t *imp, 1187 pixman_op_t op, 1188 uint32_t * pd, 1189 const uint32_t * ps, 1190 const uint32_t * pm, 1191 int w) 1192 { 1193 uint32_t s, d; 1194 1195 __m128i xmm_src_lo, xmm_src_hi; 1196 __m128i xmm_dst_lo, xmm_dst_hi; 1197 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 1198 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 1199 1200 while (w && ((uintptr_t)pd & 15)) 1201 { 1202 s = combine1 (ps, pm); 1203 d = *pd; 1204 1205 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); 1206 ps++; 1207 w--; 1208 if (pm) 1209 pm++; 1210 } 1211 1212 while (w >= 4) 1213 { 1214 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); 1215 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1216 1217 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1218 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1219 1220 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1221 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1222 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1223 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1224 1225 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, 1226 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1227 1228 pix_add_multiply_2x128 ( 1229 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 1230 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, 1231 &xmm_dst_lo, &xmm_dst_hi); 1232 1233 save_128_aligned ( 1234 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1235 1236 ps += 4; 1237 pd += 4; 1238 w -= 4; 1239 if (pm) 1240 pm += 4; 1241 } 1242 1243 while (w) 1244 { 1245 s = combine1 (ps, pm); 1246 d = *pd; 1247 1248 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); 1249 ps++; 1250 w--; 1251 if (pm) 1252 pm++; 1253 } 1254 } 1255 1256 static force_inline uint32_t 1257 core_combine_xor_u_pixel_sse2 (uint32_t src, 1258 uint32_t dst) 1259 { 1260 __m128i s = unpack_32_1x128 (src); 1261 __m128i d = unpack_32_1x128 (dst); 1262 1263 __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d)); 1264 __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s)); 1265 1266 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s)); 1267 } 1268 1269 static void 1270 sse2_combine_xor_u (pixman_implementation_t *imp, 1271 pixman_op_t op, 1272 uint32_t * dst, 1273 const uint32_t * src, 1274 const uint32_t * mask, 1275 int width) 1276 { 1277 int w = width; 1278 uint32_t s, d; 1279 uint32_t* pd = dst; 1280 const uint32_t* ps = src; 1281 const uint32_t* pm = mask; 1282 1283 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 1284 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1285 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 1286 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 1287 1288 while (w && ((uintptr_t)pd & 15)) 1289 { 1290 s = combine1 (ps, pm); 1291 d = *pd; 1292 1293 *pd++ = core_combine_xor_u_pixel_sse2 (s, d); 1294 w--; 1295 ps++; 1296 if (pm) 1297 pm++; 1298 } 1299 1300 while (w >= 4) 1301 { 1302 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); 1303 xmm_dst = load_128_aligned ((__m128i*) pd); 1304 1305 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1306 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1307 1308 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1309 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1310 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1311 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1312 1313 negate_2x128 (xmm_alpha_src_lo, 
xmm_alpha_src_hi, 1314 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1315 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, 1316 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1317 1318 pix_add_multiply_2x128 ( 1319 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 1320 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, 1321 &xmm_dst_lo, &xmm_dst_hi); 1322 1323 save_128_aligned ( 1324 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1325 1326 ps += 4; 1327 pd += 4; 1328 w -= 4; 1329 if (pm) 1330 pm += 4; 1331 } 1332 1333 while (w) 1334 { 1335 s = combine1 (ps, pm); 1336 d = *pd; 1337 1338 *pd++ = core_combine_xor_u_pixel_sse2 (s, d); 1339 w--; 1340 ps++; 1341 if (pm) 1342 pm++; 1343 } 1344 } 1345 1346 static force_inline void 1347 sse2_combine_add_u (pixman_implementation_t *imp, 1348 pixman_op_t op, 1349 uint32_t * dst, 1350 const uint32_t * src, 1351 const uint32_t * mask, 1352 int width) 1353 { 1354 int w = width; 1355 uint32_t s, d; 1356 uint32_t* pd = dst; 1357 const uint32_t* ps = src; 1358 const uint32_t* pm = mask; 1359 1360 while (w && (uintptr_t)pd & 15) 1361 { 1362 s = combine1 (ps, pm); 1363 d = *pd; 1364 1365 ps++; 1366 if (pm) 1367 pm++; 1368 *pd++ = _mm_cvtsi128_si32 ( 1369 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); 1370 w--; 1371 } 1372 1373 while (w >= 4) 1374 { 1375 __m128i s; 1376 1377 s = combine4 ((__m128i*)ps, (__m128i*)pm); 1378 1379 save_128_aligned ( 1380 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd))); 1381 1382 pd += 4; 1383 ps += 4; 1384 if (pm) 1385 pm += 4; 1386 w -= 4; 1387 } 1388 1389 while (w--) 1390 { 1391 s = combine1 (ps, pm); 1392 d = *pd; 1393 1394 ps++; 1395 *pd++ = _mm_cvtsi128_si32 ( 1396 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); 1397 if (pm) 1398 pm++; 1399 } 1400 } 1401 1402 static force_inline uint32_t 1403 core_combine_saturate_u_pixel_sse2 (uint32_t src, 1404 uint32_t dst) 1405 { 1406 __m128i ms = unpack_32_1x128 (src); 1407 __m128i md = unpack_32_1x128 (dst); 1408 uint32_t sa = src >> 24; 1409 uint32_t da = ~dst >> 24; 1410 1411 if (sa > da) 1412 { 1413 ms = pix_multiply_1x128 ( 1414 ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24))); 1415 } 1416 1417 return pack_1x128_32 (_mm_adds_epu16 (md, ms)); 1418 } 1419 1420 static void 1421 sse2_combine_saturate_u (pixman_implementation_t *imp, 1422 pixman_op_t op, 1423 uint32_t * pd, 1424 const uint32_t * ps, 1425 const uint32_t * pm, 1426 int w) 1427 { 1428 uint32_t s, d; 1429 1430 uint32_t pack_cmp; 1431 __m128i xmm_src, xmm_dst; 1432 1433 while (w && (uintptr_t)pd & 15) 1434 { 1435 s = combine1 (ps, pm); 1436 d = *pd; 1437 1438 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1439 w--; 1440 ps++; 1441 if (pm) 1442 pm++; 1443 } 1444 1445 while (w >= 4) 1446 { 1447 xmm_dst = load_128_aligned ((__m128i*)pd); 1448 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); 1449 1450 pack_cmp = _mm_movemask_epi8 ( 1451 _mm_cmpgt_epi32 ( 1452 _mm_srli_epi32 (xmm_src, 24), 1453 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24))); 1454 1455 /* if some src alpha is greater than the respective ~dst alpha */ 1456 if (pack_cmp) 1457 { 1458 s = combine1 (ps++, pm); 1459 d = *pd; 1460 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1461 if (pm) 1462 pm++; 1463 1464 s = combine1 (ps++, pm); 1465 d = *pd; 1466 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1467 if (pm) 1468 pm++; 1469 1470 s = combine1 (ps++, pm); 1471 d = *pd; 1472 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1473 if (pm) 1474 pm++; 1475 1476 s
= combine1 (ps++, pm); 1477 d = *pd; 1478 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1479 if (pm) 1480 pm++; 1481 } 1482 else 1483 { 1484 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); 1485 1486 pd += 4; 1487 ps += 4; 1488 if (pm) 1489 pm += 4; 1490 } 1491 1492 w -= 4; 1493 } 1494 1495 while (w--) 1496 { 1497 s = combine1 (ps, pm); 1498 d = *pd; 1499 1500 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1501 ps++; 1502 if (pm) 1503 pm++; 1504 } 1505 } 1506 1507 static void 1508 sse2_combine_src_ca (pixman_implementation_t *imp, 1509 pixman_op_t op, 1510 uint32_t * pd, 1511 const uint32_t * ps, 1512 const uint32_t * pm, 1513 int w) 1514 { 1515 uint32_t s, m; 1516 1517 __m128i xmm_src_lo, xmm_src_hi; 1518 __m128i xmm_mask_lo, xmm_mask_hi; 1519 __m128i xmm_dst_lo, xmm_dst_hi; 1520 1521 while (w && (uintptr_t)pd & 15) 1522 { 1523 s = *ps++; 1524 m = *pm++; 1525 *pd++ = pack_1x128_32 ( 1526 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); 1527 w--; 1528 } 1529 1530 while (w >= 4) 1531 { 1532 xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1533 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1534 1535 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1536 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1537 1538 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1539 &xmm_mask_lo, &xmm_mask_hi, 1540 &xmm_dst_lo, &xmm_dst_hi); 1541 1542 save_128_aligned ( 1543 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1544 1545 ps += 4; 1546 pd += 4; 1547 pm += 4; 1548 w -= 4; 1549 } 1550 1551 while (w) 1552 { 1553 s = *ps++; 1554 m = *pm++; 1555 *pd++ = pack_1x128_32 ( 1556 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); 1557 w--; 1558 } 1559 } 1560 1561 static force_inline uint32_t 1562 core_combine_over_ca_pixel_sse2 (uint32_t src, 1563 uint32_t mask, 1564 uint32_t dst) 1565 { 1566 __m128i s = unpack_32_1x128 (src); 1567 __m128i expAlpha = expand_alpha_1x128 (s); 1568 __m128i unpk_mask = unpack_32_1x128 (mask); 1569 __m128i unpk_dst = unpack_32_1x128 (dst); 1570 1571 return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst)); 1572 } 1573 1574 static void 1575 sse2_combine_over_ca (pixman_implementation_t *imp, 1576 pixman_op_t op, 1577 uint32_t * pd, 1578 const uint32_t * ps, 1579 const uint32_t * pm, 1580 int w) 1581 { 1582 uint32_t s, m, d; 1583 1584 __m128i xmm_alpha_lo, xmm_alpha_hi; 1585 __m128i xmm_src_lo, xmm_src_hi; 1586 __m128i xmm_dst_lo, xmm_dst_hi; 1587 __m128i xmm_mask_lo, xmm_mask_hi; 1588 1589 while (w && (uintptr_t)pd & 15) 1590 { 1591 s = *ps++; 1592 m = *pm++; 1593 d = *pd; 1594 1595 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); 1596 w--; 1597 } 1598 1599 while (w >= 4) 1600 { 1601 xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1602 xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1603 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1604 1605 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1606 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1607 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1608 1609 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1610 &xmm_alpha_lo, &xmm_alpha_hi); 1611 1612 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 1613 &xmm_alpha_lo, &xmm_alpha_hi, 1614 &xmm_mask_lo, &xmm_mask_hi, 1615 &xmm_dst_lo, &xmm_dst_hi); 1616 1617 save_128_aligned ( 1618 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1619 1620 ps += 4; 1621 pd += 4; 1622 pm += 4; 1623 w -= 4; 1624 } 1625 1626 while (w) 1627 { 1628 s = *ps++; 1629 m = *pm++; 1630 d = 
*pd; 1631 1632 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); 1633 w--; 1634 } 1635 } 1636 1637 static force_inline uint32_t 1638 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, 1639 uint32_t mask, 1640 uint32_t dst) 1641 { 1642 __m128i d = unpack_32_1x128 (dst); 1643 1644 return pack_1x128_32 ( 1645 over_1x128 (d, expand_alpha_1x128 (d), 1646 pix_multiply_1x128 (unpack_32_1x128 (src), 1647 unpack_32_1x128 (mask)))); 1648 } 1649 1650 static void 1651 sse2_combine_over_reverse_ca (pixman_implementation_t *imp, 1652 pixman_op_t op, 1653 uint32_t * pd, 1654 const uint32_t * ps, 1655 const uint32_t * pm, 1656 int w) 1657 { 1658 uint32_t s, m, d; 1659 1660 __m128i xmm_alpha_lo, xmm_alpha_hi; 1661 __m128i xmm_src_lo, xmm_src_hi; 1662 __m128i xmm_dst_lo, xmm_dst_hi; 1663 __m128i xmm_mask_lo, xmm_mask_hi; 1664 1665 while (w && (uintptr_t)pd & 15) 1666 { 1667 s = *ps++; 1668 m = *pm++; 1669 d = *pd; 1670 1671 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); 1672 w--; 1673 } 1674 1675 while (w >= 4) 1676 { 1677 xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1678 xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1679 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1680 1681 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1682 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1683 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1684 1685 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1686 &xmm_alpha_lo, &xmm_alpha_hi); 1687 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1688 &xmm_mask_lo, &xmm_mask_hi, 1689 &xmm_mask_lo, &xmm_mask_hi); 1690 1691 over_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1692 &xmm_alpha_lo, &xmm_alpha_hi, 1693 &xmm_mask_lo, &xmm_mask_hi); 1694 1695 save_128_aligned ( 1696 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); 1697 1698 ps += 4; 1699 pd += 4; 1700 pm += 4; 1701 w -= 4; 1702 } 1703 1704 while (w) 1705 { 1706 s = *ps++; 1707 m = *pm++; 1708 d = *pd; 1709 1710 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); 1711 w--; 1712 } 1713 } 1714 1715 static void 1716 sse2_combine_in_ca (pixman_implementation_t *imp, 1717 pixman_op_t op, 1718 uint32_t * pd, 1719 const uint32_t * ps, 1720 const uint32_t * pm, 1721 int w) 1722 { 1723 uint32_t s, m, d; 1724 1725 __m128i xmm_alpha_lo, xmm_alpha_hi; 1726 __m128i xmm_src_lo, xmm_src_hi; 1727 __m128i xmm_dst_lo, xmm_dst_hi; 1728 __m128i xmm_mask_lo, xmm_mask_hi; 1729 1730 while (w && (uintptr_t)pd & 15) 1731 { 1732 s = *ps++; 1733 m = *pm++; 1734 d = *pd; 1735 1736 *pd++ = pack_1x128_32 ( 1737 pix_multiply_1x128 ( 1738 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), 1739 expand_alpha_1x128 (unpack_32_1x128 (d)))); 1740 1741 w--; 1742 } 1743 1744 while (w >= 4) 1745 { 1746 xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1747 xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1748 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1749 1750 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1751 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1752 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1753 1754 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1755 &xmm_alpha_lo, &xmm_alpha_hi); 1756 1757 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1758 &xmm_mask_lo, &xmm_mask_hi, 1759 &xmm_dst_lo, &xmm_dst_hi); 1760 1761 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1762 &xmm_alpha_lo, &xmm_alpha_hi, 1763 &xmm_dst_lo, &xmm_dst_hi); 1764 1765 save_128_aligned ( 1766 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1767 1768 ps += 4; 1769 pd += 4; 1770 pm += 4; 
1771 w -= 4; 1772 } 1773 1774 while (w) 1775 { 1776 s = *ps++; 1777 m = *pm++; 1778 d = *pd; 1779 1780 *pd++ = pack_1x128_32 ( 1781 pix_multiply_1x128 ( 1782 pix_multiply_1x128 ( 1783 unpack_32_1x128 (s), unpack_32_1x128 (m)), 1784 expand_alpha_1x128 (unpack_32_1x128 (d)))); 1785 1786 w--; 1787 } 1788 } 1789 1790 static void 1791 sse2_combine_in_reverse_ca (pixman_implementation_t *imp, 1792 pixman_op_t op, 1793 uint32_t * pd, 1794 const uint32_t * ps, 1795 const uint32_t * pm, 1796 int w) 1797 { 1798 uint32_t s, m, d; 1799 1800 __m128i xmm_alpha_lo, xmm_alpha_hi; 1801 __m128i xmm_src_lo, xmm_src_hi; 1802 __m128i xmm_dst_lo, xmm_dst_hi; 1803 __m128i xmm_mask_lo, xmm_mask_hi; 1804 1805 while (w && (uintptr_t)pd & 15) 1806 { 1807 s = *ps++; 1808 m = *pm++; 1809 d = *pd; 1810 1811 *pd++ = pack_1x128_32 ( 1812 pix_multiply_1x128 ( 1813 unpack_32_1x128 (d), 1814 pix_multiply_1x128 (unpack_32_1x128 (m), 1815 expand_alpha_1x128 (unpack_32_1x128 (s))))); 1816 w--; 1817 } 1818 1819 while (w >= 4) 1820 { 1821 xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1822 xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1823 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1824 1825 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1826 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1827 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1828 1829 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1830 &xmm_alpha_lo, &xmm_alpha_hi); 1831 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, 1832 &xmm_alpha_lo, &xmm_alpha_hi, 1833 &xmm_alpha_lo, &xmm_alpha_hi); 1834 1835 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1836 &xmm_alpha_lo, &xmm_alpha_hi, 1837 &xmm_dst_lo, &xmm_dst_hi); 1838 1839 save_128_aligned ( 1840 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1841 1842 ps += 4; 1843 pd += 4; 1844 pm += 4; 1845 w -= 4; 1846 } 1847 1848 while (w) 1849 { 1850 s = *ps++; 1851 m = *pm++; 1852 d = *pd; 1853 1854 *pd++ = pack_1x128_32 ( 1855 pix_multiply_1x128 ( 1856 unpack_32_1x128 (d), 1857 pix_multiply_1x128 (unpack_32_1x128 (m), 1858 expand_alpha_1x128 (unpack_32_1x128 (s))))); 1859 w--; 1860 } 1861 } 1862 1863 static void 1864 sse2_combine_out_ca (pixman_implementation_t *imp, 1865 pixman_op_t op, 1866 uint32_t * pd, 1867 const uint32_t * ps, 1868 const uint32_t * pm, 1869 int w) 1870 { 1871 uint32_t s, m, d; 1872 1873 __m128i xmm_alpha_lo, xmm_alpha_hi; 1874 __m128i xmm_src_lo, xmm_src_hi; 1875 __m128i xmm_dst_lo, xmm_dst_hi; 1876 __m128i xmm_mask_lo, xmm_mask_hi; 1877 1878 while (w && (uintptr_t)pd & 15) 1879 { 1880 s = *ps++; 1881 m = *pm++; 1882 d = *pd; 1883 1884 *pd++ = pack_1x128_32 ( 1885 pix_multiply_1x128 ( 1886 pix_multiply_1x128 ( 1887 unpack_32_1x128 (s), unpack_32_1x128 (m)), 1888 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); 1889 w--; 1890 } 1891 1892 while (w >= 4) 1893 { 1894 xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1895 xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1896 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1897 1898 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1899 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1900 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1901 1902 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1903 &xmm_alpha_lo, &xmm_alpha_hi); 1904 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, 1905 &xmm_alpha_lo, &xmm_alpha_hi); 1906 1907 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1908 &xmm_mask_lo, &xmm_mask_hi, 1909 &xmm_dst_lo, &xmm_dst_hi); 1910 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1911 
&xmm_alpha_lo, &xmm_alpha_hi, 1912 &xmm_dst_lo, &xmm_dst_hi); 1913 1914 save_128_aligned ( 1915 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1916 1917 ps += 4; 1918 pd += 4; 1919 pm += 4; 1920 w -= 4; 1921 } 1922 1923 while (w) 1924 { 1925 s = *ps++; 1926 m = *pm++; 1927 d = *pd; 1928 1929 *pd++ = pack_1x128_32 ( 1930 pix_multiply_1x128 ( 1931 pix_multiply_1x128 ( 1932 unpack_32_1x128 (s), unpack_32_1x128 (m)), 1933 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); 1934 1935 w--; 1936 } 1937 } 1938 1939 static void 1940 sse2_combine_out_reverse_ca (pixman_implementation_t *imp, 1941 pixman_op_t op, 1942 uint32_t * pd, 1943 const uint32_t * ps, 1944 const uint32_t * pm, 1945 int w) 1946 { 1947 uint32_t s, m, d; 1948 1949 __m128i xmm_alpha_lo, xmm_alpha_hi; 1950 __m128i xmm_src_lo, xmm_src_hi; 1951 __m128i xmm_dst_lo, xmm_dst_hi; 1952 __m128i xmm_mask_lo, xmm_mask_hi; 1953 1954 while (w && (uintptr_t)pd & 15) 1955 { 1956 s = *ps++; 1957 m = *pm++; 1958 d = *pd; 1959 1960 *pd++ = pack_1x128_32 ( 1961 pix_multiply_1x128 ( 1962 unpack_32_1x128 (d), 1963 negate_1x128 (pix_multiply_1x128 ( 1964 unpack_32_1x128 (m), 1965 expand_alpha_1x128 (unpack_32_1x128 (s)))))); 1966 w--; 1967 } 1968 1969 while (w >= 4) 1970 { 1971 xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1972 xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1973 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1974 1975 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1976 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1977 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1978 1979 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1980 &xmm_alpha_lo, &xmm_alpha_hi); 1981 1982 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, 1983 &xmm_alpha_lo, &xmm_alpha_hi, 1984 &xmm_mask_lo, &xmm_mask_hi); 1985 1986 negate_2x128 (xmm_mask_lo, xmm_mask_hi, 1987 &xmm_mask_lo, &xmm_mask_hi); 1988 1989 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1990 &xmm_mask_lo, &xmm_mask_hi, 1991 &xmm_dst_lo, &xmm_dst_hi); 1992 1993 save_128_aligned ( 1994 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1995 1996 ps += 4; 1997 pd += 4; 1998 pm += 4; 1999 w -= 4; 2000 } 2001 2002 while (w) 2003 { 2004 s = *ps++; 2005 m = *pm++; 2006 d = *pd; 2007 2008 *pd++ = pack_1x128_32 ( 2009 pix_multiply_1x128 ( 2010 unpack_32_1x128 (d), 2011 negate_1x128 (pix_multiply_1x128 ( 2012 unpack_32_1x128 (m), 2013 expand_alpha_1x128 (unpack_32_1x128 (s)))))); 2014 w--; 2015 } 2016 } 2017 2018 static force_inline uint32_t 2019 core_combine_atop_ca_pixel_sse2 (uint32_t src, 2020 uint32_t mask, 2021 uint32_t dst) 2022 { 2023 __m128i m = unpack_32_1x128 (mask); 2024 __m128i s = unpack_32_1x128 (src); 2025 __m128i d = unpack_32_1x128 (dst); 2026 __m128i sa = expand_alpha_1x128 (s); 2027 __m128i da = expand_alpha_1x128 (d); 2028 2029 s = pix_multiply_1x128 (s, m); 2030 m = negate_1x128 (pix_multiply_1x128 (m, sa)); 2031 2032 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); 2033 } 2034 2035 static void 2036 sse2_combine_atop_ca (pixman_implementation_t *imp, 2037 pixman_op_t op, 2038 uint32_t * pd, 2039 const uint32_t * ps, 2040 const uint32_t * pm, 2041 int w) 2042 { 2043 uint32_t s, m, d; 2044 2045 __m128i xmm_src_lo, xmm_src_hi; 2046 __m128i xmm_dst_lo, xmm_dst_hi; 2047 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 2048 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 2049 __m128i xmm_mask_lo, xmm_mask_hi; 2050 2051 while (w && (uintptr_t)pd & 15) 2052 { 2053 s = *ps++; 2054 m = *pm++; 2055 d = *pd; 2056 2057 *pd++ = 
core_combine_atop_ca_pixel_sse2 (s, m, d); 2058 w--; 2059 } 2060 2061 while (w >= 4) 2062 { 2063 xmm_dst_hi = load_128_aligned ((__m128i*)pd); 2064 xmm_src_hi = load_128_unaligned ((__m128i*)ps); 2065 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 2066 2067 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 2068 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 2069 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 2070 2071 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 2072 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 2073 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 2074 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 2075 2076 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 2077 &xmm_mask_lo, &xmm_mask_hi, 2078 &xmm_src_lo, &xmm_src_hi); 2079 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, 2080 &xmm_alpha_src_lo, &xmm_alpha_src_hi, 2081 &xmm_mask_lo, &xmm_mask_hi); 2082 2083 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 2084 2085 pix_add_multiply_2x128 ( 2086 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, 2087 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 2088 &xmm_dst_lo, &xmm_dst_hi); 2089 2090 save_128_aligned ( 2091 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 2092 2093 ps += 4; 2094 pd += 4; 2095 pm += 4; 2096 w -= 4; 2097 } 2098 2099 while (w) 2100 { 2101 s = *ps++; 2102 m = *pm++; 2103 d = *pd; 2104 2105 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); 2106 w--; 2107 } 2108 } 2109 2110 static force_inline uint32_t 2111 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, 2112 uint32_t mask, 2113 uint32_t dst) 2114 { 2115 __m128i m = unpack_32_1x128 (mask); 2116 __m128i s = unpack_32_1x128 (src); 2117 __m128i d = unpack_32_1x128 (dst); 2118 2119 __m128i da = negate_1x128 (expand_alpha_1x128 (d)); 2120 __m128i sa = expand_alpha_1x128 (s); 2121 2122 s = pix_multiply_1x128 (s, m); 2123 m = pix_multiply_1x128 (m, sa); 2124 2125 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); 2126 } 2127 2128 static void 2129 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, 2130 pixman_op_t op, 2131 uint32_t * pd, 2132 const uint32_t * ps, 2133 const uint32_t * pm, 2134 int w) 2135 { 2136 uint32_t s, m, d; 2137 2138 __m128i xmm_src_lo, xmm_src_hi; 2139 __m128i xmm_dst_lo, xmm_dst_hi; 2140 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 2141 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 2142 __m128i xmm_mask_lo, xmm_mask_hi; 2143 2144 while (w && (uintptr_t)pd & 15) 2145 { 2146 s = *ps++; 2147 m = *pm++; 2148 d = *pd; 2149 2150 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); 2151 w--; 2152 } 2153 2154 while (w >= 4) 2155 { 2156 xmm_dst_hi = load_128_aligned ((__m128i*)pd); 2157 xmm_src_hi = load_128_unaligned ((__m128i*)ps); 2158 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 2159 2160 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 2161 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 2162 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 2163 2164 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 2165 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 2166 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 2167 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 2168 2169 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 2170 &xmm_mask_lo, &xmm_mask_hi, 2171 &xmm_src_lo, &xmm_src_hi); 2172 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, 2173 &xmm_alpha_src_lo, &xmm_alpha_src_hi, 2174 &xmm_mask_lo, &xmm_mask_hi); 2175 2176 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, 2177 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 
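/* Here xmm_src holds src * mask, xmm_mask holds mask * src.alpha and
 * xmm_alpha_dst holds (1 - dst.alpha), so the add/multiply below yields
 * dst * (mask * src.alpha) + (src * mask) * (1 - dst.alpha) per channel.
 */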
2178 2179 pix_add_multiply_2x128 ( 2180 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, 2181 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 2182 &xmm_dst_lo, &xmm_dst_hi); 2183 2184 save_128_aligned ( 2185 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 2186 2187 ps += 4; 2188 pd += 4; 2189 pm += 4; 2190 w -= 4; 2191 } 2192 2193 while (w) 2194 { 2195 s = *ps++; 2196 m = *pm++; 2197 d = *pd; 2198 2199 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); 2200 w--; 2201 } 2202 } 2203 2204 static force_inline uint32_t 2205 core_combine_xor_ca_pixel_sse2 (uint32_t src, 2206 uint32_t mask, 2207 uint32_t dst) 2208 { 2209 __m128i a = unpack_32_1x128 (mask); 2210 __m128i s = unpack_32_1x128 (src); 2211 __m128i d = unpack_32_1x128 (dst); 2212 2213 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 ( 2214 a, expand_alpha_1x128 (s))); 2215 __m128i dest = pix_multiply_1x128 (s, a); 2216 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d)); 2217 2218 return pack_1x128_32 (pix_add_multiply_1x128 (&d, 2219 &alpha_dst, 2220 &dest, 2221 &alpha_src)); 2222 } 2223 2224 static void 2225 sse2_combine_xor_ca (pixman_implementation_t *imp, 2226 pixman_op_t op, 2227 uint32_t * pd, 2228 const uint32_t * ps, 2229 const uint32_t * pm, 2230 int w) 2231 { 2232 uint32_t s, m, d; 2233 2234 __m128i xmm_src_lo, xmm_src_hi; 2235 __m128i xmm_dst_lo, xmm_dst_hi; 2236 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 2237 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 2238 __m128i xmm_mask_lo, xmm_mask_hi; 2239 2240 while (w && (uintptr_t)pd & 15) 2241 { 2242 s = *ps++; 2243 m = *pm++; 2244 d = *pd; 2245 2246 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); 2247 w--; 2248 } 2249 2250 while (w >= 4) 2251 { 2252 xmm_dst_hi = load_128_aligned ((__m128i*)pd); 2253 xmm_src_hi = load_128_unaligned ((__m128i*)ps); 2254 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 2255 2256 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 2257 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 2258 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 2259 2260 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 2261 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 2262 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 2263 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 2264 2265 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 2266 &xmm_mask_lo, &xmm_mask_hi, 2267 &xmm_src_lo, &xmm_src_hi); 2268 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, 2269 &xmm_alpha_src_lo, &xmm_alpha_src_hi, 2270 &xmm_mask_lo, &xmm_mask_hi); 2271 2272 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, 2273 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 2274 negate_2x128 (xmm_mask_lo, xmm_mask_hi, 2275 &xmm_mask_lo, &xmm_mask_hi); 2276 2277 pix_add_multiply_2x128 ( 2278 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, 2279 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 2280 &xmm_dst_lo, &xmm_dst_hi); 2281 2282 save_128_aligned ( 2283 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 2284 2285 ps += 4; 2286 pd += 4; 2287 pm += 4; 2288 w -= 4; 2289 } 2290 2291 while (w) 2292 { 2293 s = *ps++; 2294 m = *pm++; 2295 d = *pd; 2296 2297 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); 2298 w--; 2299 } 2300 } 2301 2302 static void 2303 sse2_combine_add_ca (pixman_implementation_t *imp, 2304 pixman_op_t op, 2305 uint32_t * pd, 2306 const uint32_t * ps, 2307 const uint32_t * pm, 2308 int w) 2309 { 2310 uint32_t s, m, d; 2311 2312 __m128i xmm_src_lo, xmm_src_hi; 2313 __m128i xmm_dst_lo, xmm_dst_hi; 2314 __m128i xmm_mask_lo, xmm_mask_hi; 2315 
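/* Component-alpha ADD: dest = clamp (dest + src * mask) per channel,
 * implemented with saturating unsigned byte additions.
 */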
2316 while (w && (uintptr_t)pd & 15) 2317 { 2318 s = *ps++; 2319 m = *pm++; 2320 d = *pd; 2321 2322 *pd++ = pack_1x128_32 ( 2323 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), 2324 unpack_32_1x128 (m)), 2325 unpack_32_1x128 (d))); 2326 w--; 2327 } 2328 2329 while (w >= 4) 2330 { 2331 xmm_src_hi = load_128_unaligned ((__m128i*)ps); 2332 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 2333 xmm_dst_hi = load_128_aligned ((__m128i*)pd); 2334 2335 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 2336 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 2337 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 2338 2339 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 2340 &xmm_mask_lo, &xmm_mask_hi, 2341 &xmm_src_lo, &xmm_src_hi); 2342 2343 save_128_aligned ( 2344 (__m128i*)pd, pack_2x128_128 ( 2345 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), 2346 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi))); 2347 2348 ps += 4; 2349 pd += 4; 2350 pm += 4; 2351 w -= 4; 2352 } 2353 2354 while (w) 2355 { 2356 s = *ps++; 2357 m = *pm++; 2358 d = *pd; 2359 2360 *pd++ = pack_1x128_32 ( 2361 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), 2362 unpack_32_1x128 (m)), 2363 unpack_32_1x128 (d))); 2364 w--; 2365 } 2366 } 2367 2368 static force_inline __m128i 2369 create_mask_16_128 (uint16_t mask) 2370 { 2371 return _mm_set1_epi16 (mask); 2372 } 2373 2374 /* Work around a code generation bug in Sun Studio 12. */ 2375 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) 2376 # define create_mask_2x32_128(mask0, mask1) \ 2377 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) 2378 #else 2379 static force_inline __m128i 2380 create_mask_2x32_128 (uint32_t mask0, 2381 uint32_t mask1) 2382 { 2383 return _mm_set_epi32 (mask0, mask1, mask0, mask1); 2384 } 2385 #endif 2386 2387 static void 2388 sse2_composite_over_n_8888 (pixman_implementation_t *imp, 2389 pixman_composite_info_t *info) 2390 { 2391 PIXMAN_COMPOSITE_ARGS (info); 2392 uint32_t src; 2393 uint32_t *dst_line, *dst, d; 2394 int32_t w; 2395 int dst_stride; 2396 __m128i xmm_src, xmm_alpha; 2397 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 2398 2399 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2400 2401 if (src == 0) 2402 return; 2403 2404 PIXMAN_IMAGE_GET_LINE ( 2405 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 2406 2407 xmm_src = expand_pixel_32_1x128 (src); 2408 xmm_alpha = expand_alpha_1x128 (xmm_src); 2409 2410 while (height--) 2411 { 2412 dst = dst_line; 2413 2414 dst_line += dst_stride; 2415 w = width; 2416 2417 while (w && (uintptr_t)dst & 15) 2418 { 2419 d = *dst; 2420 *dst++ = pack_1x128_32 (over_1x128 (xmm_src, 2421 xmm_alpha, 2422 unpack_32_1x128 (d))); 2423 w--; 2424 } 2425 2426 while (w >= 4) 2427 { 2428 xmm_dst = load_128_aligned ((__m128i*)dst); 2429 2430 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 2431 2432 over_2x128 (&xmm_src, &xmm_src, 2433 &xmm_alpha, &xmm_alpha, 2434 &xmm_dst_lo, &xmm_dst_hi); 2435 2436 /* rebuild the 4 pixel data and save */ 2437 save_128_aligned ( 2438 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 2439 2440 w -= 4; 2441 dst += 4; 2442 } 2443 2444 while (w) 2445 { 2446 d = *dst; 2447 *dst++ = pack_1x128_32 (over_1x128 (xmm_src, 2448 xmm_alpha, 2449 unpack_32_1x128 (d))); 2450 w--; 2451 } 2452 2453 } 2454 } 2455 2456 static void 2457 sse2_composite_over_n_0565 (pixman_implementation_t *imp, 2458 pixman_composite_info_t *info) 2459 { 2460 PIXMAN_COMPOSITE_ARGS (info); 2461 uint32_t src; 2462 uint16_t *dst_line, *dst, d; 2463 int32_t w; 2464 int
dst_stride; 2465 __m128i xmm_src, xmm_alpha; 2466 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 2467 2468 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2469 2470 if (src == 0) 2471 return; 2472 2473 PIXMAN_IMAGE_GET_LINE ( 2474 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 2475 2476 xmm_src = expand_pixel_32_1x128 (src); 2477 xmm_alpha = expand_alpha_1x128 (xmm_src); 2478 2479 while (height--) 2480 { 2481 dst = dst_line; 2482 2483 dst_line += dst_stride; 2484 w = width; 2485 2486 while (w && (uintptr_t)dst & 15) 2487 { 2488 d = *dst; 2489 2490 *dst++ = pack_565_32_16 ( 2491 pack_1x128_32 (over_1x128 (xmm_src, 2492 xmm_alpha, 2493 expand565_16_1x128 (d)))); 2494 w--; 2495 } 2496 2497 while (w >= 8) 2498 { 2499 xmm_dst = load_128_aligned ((__m128i*)dst); 2500 2501 unpack_565_128_4x128 (xmm_dst, 2502 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 2503 2504 over_2x128 (&xmm_src, &xmm_src, 2505 &xmm_alpha, &xmm_alpha, 2506 &xmm_dst0, &xmm_dst1); 2507 over_2x128 (&xmm_src, &xmm_src, 2508 &xmm_alpha, &xmm_alpha, 2509 &xmm_dst2, &xmm_dst3); 2510 2511 xmm_dst = pack_565_4x128_128 ( 2512 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 2513 2514 save_128_aligned ((__m128i*)dst, xmm_dst); 2515 2516 dst += 8; 2517 w -= 8; 2518 } 2519 2520 while (w--) 2521 { 2522 d = *dst; 2523 *dst++ = pack_565_32_16 ( 2524 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, 2525 expand565_16_1x128 (d)))); 2526 } 2527 } 2528 2529 } 2530 2531 static void 2532 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, 2533 pixman_composite_info_t *info) 2534 { 2535 PIXMAN_COMPOSITE_ARGS (info); 2536 uint32_t src; 2537 uint32_t *dst_line, d; 2538 uint32_t *mask_line, m; 2539 uint32_t pack_cmp; 2540 int dst_stride, mask_stride; 2541 2542 __m128i xmm_src; 2543 __m128i xmm_dst; 2544 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 2545 2546 __m128i mmx_src, mmx_mask, mmx_dest; 2547 2548 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2549 2550 if (src == 0) 2551 return; 2552 2553 PIXMAN_IMAGE_GET_LINE ( 2554 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 2555 PIXMAN_IMAGE_GET_LINE ( 2556 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); 2557 2558 xmm_src = _mm_unpacklo_epi8 ( 2559 create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); 2560 mmx_src = xmm_src; 2561 2562 while (height--) 2563 { 2564 int w = width; 2565 const uint32_t *pm = (uint32_t *)mask_line; 2566 uint32_t *pd = (uint32_t *)dst_line; 2567 2568 dst_line += dst_stride; 2569 mask_line += mask_stride; 2570 2571 while (w && (uintptr_t)pd & 15) 2572 { 2573 m = *pm++; 2574 2575 if (m) 2576 { 2577 d = *pd; 2578 2579 mmx_mask = unpack_32_1x128 (m); 2580 mmx_dest = unpack_32_1x128 (d); 2581 2582 *pd = pack_1x128_32 ( 2583 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), 2584 mmx_dest)); 2585 } 2586 2587 pd++; 2588 w--; 2589 } 2590 2591 while (w >= 4) 2592 { 2593 xmm_mask = load_128_unaligned ((__m128i*)pm); 2594 2595 pack_cmp = 2596 _mm_movemask_epi8 ( 2597 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); 2598 2599 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */ 2600 if (pack_cmp != 0xffff) 2601 { 2602 xmm_dst = load_128_aligned ((__m128i*)pd); 2603 2604 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 2605 2606 pix_multiply_2x128 (&xmm_src, &xmm_src, 2607 &xmm_mask_lo, &xmm_mask_hi, 2608 &xmm_mask_lo, &xmm_mask_hi); 2609 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); 2610 2611 save_128_aligned ( 2612 (__m128i*)pd,
_mm_adds_epu8 (xmm_mask_hi, xmm_dst)); 2613 } 2614 2615 pd += 4; 2616 pm += 4; 2617 w -= 4; 2618 } 2619 2620 while (w) 2621 { 2622 m = *pm++; 2623 2624 if (m) 2625 { 2626 d = *pd; 2627 2628 mmx_mask = unpack_32_1x128 (m); 2629 mmx_dest = unpack_32_1x128 (d); 2630 2631 *pd = pack_1x128_32 ( 2632 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), 2633 mmx_dest)); 2634 } 2635 2636 pd++; 2637 w--; 2638 } 2639 } 2640 2641 } 2642 2643 static void 2644 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, 2645 pixman_composite_info_t *info) 2646 { 2647 PIXMAN_COMPOSITE_ARGS (info); 2648 uint32_t src; 2649 uint32_t *dst_line, d; 2650 uint32_t *mask_line, m; 2651 uint32_t pack_cmp; 2652 int dst_stride, mask_stride; 2653 2654 __m128i xmm_src, xmm_alpha; 2655 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 2656 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 2657 2658 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; 2659 2660 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2661 2662 if (src == 0) 2663 return; 2664 2665 PIXMAN_IMAGE_GET_LINE ( 2666 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 2667 PIXMAN_IMAGE_GET_LINE ( 2668 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); 2669 2670 xmm_src = _mm_unpacklo_epi8 ( 2671 create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); 2672 xmm_alpha = expand_alpha_1x128 (xmm_src); 2673 mmx_src = xmm_src; 2674 mmx_alpha = xmm_alpha; 2675 2676 while (height--) 2677 { 2678 int w = width; 2679 const uint32_t *pm = (uint32_t *)mask_line; 2680 uint32_t *pd = (uint32_t *)dst_line; 2681 2682 dst_line += dst_stride; 2683 mask_line += mask_stride; 2684 2685 while (w && (uintptr_t)pd & 15) 2686 { 2687 m = *pm++; 2688 2689 if (m) 2690 { 2691 d = *pd; 2692 mmx_mask = unpack_32_1x128 (m); 2693 mmx_dest = unpack_32_1x128 (d); 2694 2695 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src, 2696 &mmx_alpha, 2697 &mmx_mask, 2698 &mmx_dest)); 2699 } 2700 2701 pd++; 2702 w--; 2703 } 2704 2705 while (w >= 4) 2706 { 2707 xmm_mask = load_128_unaligned ((__m128i*)pm); 2708 2709 pack_cmp = 2710 _mm_movemask_epi8 ( 2711 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); 2712 2713 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */ 2714 if (pack_cmp != 0xffff) 2715 { 2716 xmm_dst = load_128_aligned ((__m128i*)pd); 2717 2718 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 2719 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 2720 2721 in_over_2x128 (&xmm_src, &xmm_src, 2722 &xmm_alpha, &xmm_alpha, 2723 &xmm_mask_lo, &xmm_mask_hi, 2724 &xmm_dst_lo, &xmm_dst_hi); 2725 2726 save_128_aligned ( 2727 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 2728 } 2729 2730 pd += 4; 2731 pm += 4; 2732 w -= 4; 2733 } 2734 2735 while (w) 2736 { 2737 m = *pm++; 2738 2739 if (m) 2740 { 2741 d = *pd; 2742 mmx_mask = unpack_32_1x128 (m); 2743 mmx_dest = unpack_32_1x128 (d); 2744 2745 *pd = pack_1x128_32 ( 2746 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); 2747 } 2748 2749 pd++; 2750 w--; 2751 } 2752 } 2753 2754 } 2755 2756 static void 2757 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, 2758 pixman_composite_info_t *info) 2759 { 2760 PIXMAN_COMPOSITE_ARGS (info); 2761 uint32_t *dst_line, *dst; 2762 uint32_t *src_line, *src; 2763 uint32_t mask; 2764 int32_t w; 2765 int dst_stride, src_stride; 2766 2767 __m128i xmm_mask; 2768 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 2769 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 2770 __m128i xmm_alpha_lo, xmm_alpha_hi; 2771 2772 PIXMAN_IMAGE_GET_LINE ( 2773
dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 2774 PIXMAN_IMAGE_GET_LINE ( 2775 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 2776 2777 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); 2778 2779 xmm_mask = create_mask_16_128 (mask >> 24); 2780 2781 while (height--) 2782 { 2783 dst = dst_line; 2784 dst_line += dst_stride; 2785 src = src_line; 2786 src_line += src_stride; 2787 w = width; 2788 2789 while (w && (uintptr_t)dst & 15) 2790 { 2791 uint32_t s = *src++; 2792 2793 if (s) 2794 { 2795 uint32_t d = *dst; 2796 2797 __m128i ms = unpack_32_1x128 (s); 2798 __m128i alpha = expand_alpha_1x128 (ms); 2799 __m128i dest = xmm_mask; 2800 __m128i alpha_dst = unpack_32_1x128 (d); 2801 2802 *dst = pack_1x128_32 ( 2803 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); 2804 } 2805 dst++; 2806 w--; 2807 } 2808 2809 while (w >= 4) 2810 { 2811 xmm_src = load_128_unaligned ((__m128i*)src); 2812 2813 if (!is_zero (xmm_src)) 2814 { 2815 xmm_dst = load_128_aligned ((__m128i*)dst); 2816 2817 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 2818 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 2819 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 2820 &xmm_alpha_lo, &xmm_alpha_hi); 2821 2822 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 2823 &xmm_alpha_lo, &xmm_alpha_hi, 2824 &xmm_mask, &xmm_mask, 2825 &xmm_dst_lo, &xmm_dst_hi); 2826 2827 save_128_aligned ( 2828 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 2829 } 2830 2831 dst += 4; 2832 src += 4; 2833 w -= 4; 2834 } 2835 2836 while (w) 2837 { 2838 uint32_t s = *src++; 2839 2840 if (s) 2841 { 2842 uint32_t d = *dst; 2843 2844 __m128i ms = unpack_32_1x128 (s); 2845 __m128i alpha = expand_alpha_1x128 (ms); 2846 __m128i mask = xmm_mask; 2847 __m128i dest = unpack_32_1x128 (d); 2848 2849 *dst = pack_1x128_32 ( 2850 in_over_1x128 (&ms, &alpha, &mask, &dest)); 2851 } 2852 2853 dst++; 2854 w--; 2855 } 2856 } 2857 2858 } 2859 2860 static void 2861 sse2_composite_src_x888_0565 (pixman_implementation_t *imp, 2862 pixman_composite_info_t *info) 2863 { 2864 PIXMAN_COMPOSITE_ARGS (info); 2865 uint16_t *dst_line, *dst; 2866 uint32_t *src_line, *src, s; 2867 int dst_stride, src_stride; 2868 int32_t w; 2869 2870 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 2871 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 2872 2873 while (height--) 2874 { 2875 dst = dst_line; 2876 dst_line += dst_stride; 2877 src = src_line; 2878 src_line += src_stride; 2879 w = width; 2880 2881 while (w && (uintptr_t)dst & 15) 2882 { 2883 s = *src++; 2884 *dst = convert_8888_to_0565 (s); 2885 dst++; 2886 w--; 2887 } 2888 2889 while (w >= 8) 2890 { 2891 __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0); 2892 __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1); 2893 2894 save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1)); 2895 2896 w -= 8; 2897 src += 8; 2898 dst += 8; 2899 } 2900 2901 while (w) 2902 { 2903 s = *src++; 2904 *dst = convert_8888_to_0565 (s); 2905 dst++; 2906 w--; 2907 } 2908 } 2909 } 2910 2911 static void 2912 sse2_composite_src_x888_8888 (pixman_implementation_t *imp, 2913 pixman_composite_info_t *info) 2914 { 2915 PIXMAN_COMPOSITE_ARGS (info); 2916 uint32_t *dst_line, *dst; 2917 uint32_t *src_line, *src; 2918 int32_t w; 2919 int dst_stride, src_stride; 2920 2921 2922 PIXMAN_IMAGE_GET_LINE ( 2923 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 2924 PIXMAN_IMAGE_GET_LINE ( 2925 src_image, src_x, 
src_y, uint32_t, src_stride, src_line, 1); 2926 2927 while (height--) 2928 { 2929 dst = dst_line; 2930 dst_line += dst_stride; 2931 src = src_line; 2932 src_line += src_stride; 2933 w = width; 2934 2935 while (w && (uintptr_t)dst & 15) 2936 { 2937 *dst++ = *src++ | 0xff000000; 2938 w--; 2939 } 2940 2941 while (w >= 16) 2942 { 2943 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; 2944 2945 xmm_src1 = load_128_unaligned ((__m128i*)src + 0); 2946 xmm_src2 = load_128_unaligned ((__m128i*)src + 1); 2947 xmm_src3 = load_128_unaligned ((__m128i*)src + 2); 2948 xmm_src4 = load_128_unaligned ((__m128i*)src + 3); 2949 2950 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); 2951 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); 2952 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); 2953 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); 2954 2955 dst += 16; 2956 src += 16; 2957 w -= 16; 2958 } 2959 2960 while (w) 2961 { 2962 *dst++ = *src++ | 0xff000000; 2963 w--; 2964 } 2965 } 2966 2967 } 2968 2969 static void 2970 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, 2971 pixman_composite_info_t *info) 2972 { 2973 PIXMAN_COMPOSITE_ARGS (info); 2974 uint32_t *dst_line, *dst; 2975 uint32_t *src_line, *src; 2976 uint32_t mask; 2977 int dst_stride, src_stride; 2978 int32_t w; 2979 2980 __m128i xmm_mask, xmm_alpha; 2981 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 2982 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 2983 2984 PIXMAN_IMAGE_GET_LINE ( 2985 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 2986 PIXMAN_IMAGE_GET_LINE ( 2987 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 2988 2989 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); 2990 2991 xmm_mask = create_mask_16_128 (mask >> 24); 2992 xmm_alpha = mask_00ff; 2993 2994 while (height--) 2995 { 2996 dst = dst_line; 2997 dst_line += dst_stride; 2998 src = src_line; 2999 src_line += src_stride; 3000 w = width; 3001 3002 while (w && (uintptr_t)dst & 15) 3003 { 3004 uint32_t s = (*src++) | 0xff000000; 3005 uint32_t d = *dst; 3006 3007 __m128i src = unpack_32_1x128 (s); 3008 __m128i alpha = xmm_alpha; 3009 __m128i mask = xmm_mask; 3010 __m128i dest = unpack_32_1x128 (d); 3011 3012 *dst++ = pack_1x128_32 ( 3013 in_over_1x128 (&src, &alpha, &mask, &dest)); 3014 3015 w--; 3016 } 3017 3018 while (w >= 4) 3019 { 3020 xmm_src = _mm_or_si128 ( 3021 load_128_unaligned ((__m128i*)src), mask_ff000000); 3022 xmm_dst = load_128_aligned ((__m128i*)dst); 3023 3024 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 3025 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 3026 3027 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 3028 &xmm_alpha, &xmm_alpha, 3029 &xmm_mask, &xmm_mask, 3030 &xmm_dst_lo, &xmm_dst_hi); 3031 3032 save_128_aligned ( 3033 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 3034 3035 dst += 4; 3036 src += 4; 3037 w -= 4; 3038 3039 } 3040 3041 while (w) 3042 { 3043 uint32_t s = (*src++) | 0xff000000; 3044 uint32_t d = *dst; 3045 3046 __m128i src = unpack_32_1x128 (s); 3047 __m128i alpha = xmm_alpha; 3048 __m128i mask = xmm_mask; 3049 __m128i dest = unpack_32_1x128 (d); 3050 3051 *dst++ = pack_1x128_32 ( 3052 in_over_1x128 (&src, &alpha, &mask, &dest)); 3053 3054 w--; 3055 } 3056 } 3057 3058 } 3059 3060 static void 3061 sse2_composite_over_8888_8888 (pixman_implementation_t *imp, 3062 pixman_composite_info_t *info) 3063 { 3064 PIXMAN_COMPOSITE_ARGS (info); 3065 int dst_stride, 
src_stride; 3066 uint32_t *dst_line, *dst; 3067 uint32_t *src_line, *src; 3068 3069 PIXMAN_IMAGE_GET_LINE ( 3070 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 3071 PIXMAN_IMAGE_GET_LINE ( 3072 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 3073 3074 dst = dst_line; 3075 src = src_line; 3076 3077 while (height--) 3078 { 3079 sse2_combine_over_u (imp, op, dst, src, NULL, width); 3080 3081 dst += dst_stride; 3082 src += src_stride; 3083 } 3084 } 3085 3086 static force_inline uint16_t 3087 composite_over_8888_0565pixel (uint32_t src, uint16_t dst) 3088 { 3089 __m128i ms; 3090 3091 ms = unpack_32_1x128 (src); 3092 return pack_565_32_16 ( 3093 pack_1x128_32 ( 3094 over_1x128 ( 3095 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); 3096 } 3097 3098 static void 3099 sse2_composite_over_8888_0565 (pixman_implementation_t *imp, 3100 pixman_composite_info_t *info) 3101 { 3102 PIXMAN_COMPOSITE_ARGS (info); 3103 uint16_t *dst_line, *dst, d; 3104 uint32_t *src_line, *src, s; 3105 int dst_stride, src_stride; 3106 int32_t w; 3107 3108 __m128i xmm_alpha_lo, xmm_alpha_hi; 3109 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 3110 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 3111 3112 PIXMAN_IMAGE_GET_LINE ( 3113 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 3114 PIXMAN_IMAGE_GET_LINE ( 3115 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 3116 3117 while (height--) 3118 { 3119 dst = dst_line; 3120 src = src_line; 3121 3122 dst_line += dst_stride; 3123 src_line += src_stride; 3124 w = width; 3125 3126 /* Align dst on a 16-byte boundary */ 3127 while (w && 3128 ((uintptr_t)dst & 15)) 3129 { 3130 s = *src++; 3131 d = *dst; 3132 3133 *dst++ = composite_over_8888_0565pixel (s, d); 3134 w--; 3135 } 3136 3137 /* It's an 8-pixel loop */ 3138 while (w >= 8) 3139 { 3140 /* I'm loading unaligned because I'm not sure 3141 * about the address alignment. 3142 */ 3143 xmm_src = load_128_unaligned ((__m128i*) src); 3144 xmm_dst = load_128_aligned ((__m128i*) dst); 3145 3146 /* Unpacking */ 3147 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 3148 unpack_565_128_4x128 (xmm_dst, 3149 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 3150 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 3151 &xmm_alpha_lo, &xmm_alpha_hi); 3152 3153 /* I'm loading the next 4 pixels from memory 3154 * ahead of time to optimize the memory read.
3155 */ 3156 xmm_src = load_128_unaligned ((__m128i*) (src + 4)); 3157 3158 over_2x128 (&xmm_src_lo, &xmm_src_hi, 3159 &xmm_alpha_lo, &xmm_alpha_hi, 3160 &xmm_dst0, &xmm_dst1); 3161 3162 /* Unpacking */ 3163 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 3164 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 3165 &xmm_alpha_lo, &xmm_alpha_hi); 3166 3167 over_2x128 (&xmm_src_lo, &xmm_src_hi, 3168 &xmm_alpha_lo, &xmm_alpha_hi, 3169 &xmm_dst2, &xmm_dst3); 3170 3171 save_128_aligned ( 3172 (__m128i*)dst, pack_565_4x128_128 ( 3173 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); 3174 3175 w -= 8; 3176 dst += 8; 3177 src += 8; 3178 } 3179 3180 while (w--) 3181 { 3182 s = *src++; 3183 d = *dst; 3184 3185 *dst++ = composite_over_8888_0565pixel (s, d); 3186 } 3187 } 3188 3189 } 3190 3191 static void 3192 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, 3193 pixman_composite_info_t *info) 3194 { 3195 PIXMAN_COMPOSITE_ARGS (info); 3196 uint32_t src, srca; 3197 uint32_t *dst_line, *dst; 3198 uint8_t *mask_line, *mask; 3199 int dst_stride, mask_stride; 3200 int32_t w; 3201 uint32_t m, d; 3202 3203 __m128i xmm_src, xmm_alpha, xmm_def; 3204 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 3205 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 3206 3207 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; 3208 3209 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 3210 3211 srca = src >> 24; 3212 if (src == 0) 3213 return; 3214 3215 PIXMAN_IMAGE_GET_LINE ( 3216 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 3217 PIXMAN_IMAGE_GET_LINE ( 3218 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 3219 3220 xmm_def = create_mask_2x32_128 (src, src); 3221 xmm_src = expand_pixel_32_1x128 (src); 3222 xmm_alpha = expand_alpha_1x128 (xmm_src); 3223 mmx_src = xmm_src; 3224 mmx_alpha = xmm_alpha; 3225 3226 while (height--) 3227 { 3228 dst = dst_line; 3229 dst_line += dst_stride; 3230 mask = mask_line; 3231 mask_line += mask_stride; 3232 w = width; 3233 3234 while (w && (uintptr_t)dst & 15) 3235 { 3236 uint8_t m = *mask++; 3237 3238 if (m) 3239 { 3240 d = *dst; 3241 mmx_mask = expand_pixel_8_1x128 (m); 3242 mmx_dest = unpack_32_1x128 (d); 3243 3244 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, 3245 &mmx_alpha, 3246 &mmx_mask, 3247 &mmx_dest)); 3248 } 3249 3250 w--; 3251 dst++; 3252 } 3253 3254 while (w >= 4) 3255 { 3256 m = *((uint32_t*)mask); 3257 3258 if (srca == 0xff && m == 0xffffffff) 3259 { 3260 save_128_aligned ((__m128i*)dst, xmm_def); 3261 } 3262 else if (m) 3263 { 3264 xmm_dst = load_128_aligned ((__m128i*) dst); 3265 xmm_mask = unpack_32_1x128 (m); 3266 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); 3267 3268 /* Unpacking */ 3269 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 3270 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 3271 3272 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 3273 &xmm_mask_lo, &xmm_mask_hi); 3274 3275 in_over_2x128 (&xmm_src, &xmm_src, 3276 &xmm_alpha, &xmm_alpha, 3277 &xmm_mask_lo, &xmm_mask_hi, 3278 &xmm_dst_lo, &xmm_dst_hi); 3279 3280 save_128_aligned ( 3281 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 3282 } 3283 3284 w -= 4; 3285 dst += 4; 3286 mask += 4; 3287 } 3288 3289 while (w) 3290 { 3291 uint8_t m = *mask++; 3292 3293 if (m) 3294 { 3295 d = *dst; 3296 mmx_mask = expand_pixel_8_1x128 (m); 3297 mmx_dest = unpack_32_1x128 (d); 3298 3299 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, 3300 &mmx_alpha, 3301 &mmx_mask, 3302 &mmx_dest)); 3303 } 3304 3305 w--; 3306 dst++; 3307 } 3308 } 3309 
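/* Note: as in the loop just above, the solid-source fast paths in this
 * file fetch four a8 mask values as one 32-bit word.  A word of
 * 0xffffffff together with a fully opaque source lets the pre-built
 * xmm_def (the solid colour replicated across the register) be stored
 * directly, while a zero word lets those four pixels be skipped
 * altogether.
 */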
3310 } 3311 3312 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) 3313 __attribute__((__force_align_arg_pointer__)) 3314 #endif 3315 static pixman_bool_t 3316 sse2_fill (pixman_implementation_t *imp, 3317 uint32_t * bits, 3318 int stride, 3319 int bpp, 3320 int x, 3321 int y, 3322 int width, 3323 int height, 3324 uint32_t filler) 3325 { 3326 uint32_t byte_width; 3327 uint8_t *byte_line; 3328 3329 __m128i xmm_def; 3330 3331 if (bpp == 8) 3332 { 3333 uint8_t b; 3334 uint16_t w; 3335 3336 stride = stride * (int) sizeof (uint32_t) / 1; 3337 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); 3338 byte_width = width; 3339 stride *= 1; 3340 3341 b = filler & 0xff; 3342 w = (b << 8) | b; 3343 filler = (w << 16) | w; 3344 } 3345 else if (bpp == 16) 3346 { 3347 stride = stride * (int) sizeof (uint32_t) / 2; 3348 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); 3349 byte_width = 2 * width; 3350 stride *= 2; 3351 3352 filler = (filler & 0xffff) * 0x00010001; 3353 } 3354 else if (bpp == 32) 3355 { 3356 stride = stride * (int) sizeof (uint32_t) / 4; 3357 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); 3358 byte_width = 4 * width; 3359 stride *= 4; 3360 } 3361 else 3362 { 3363 return FALSE; 3364 } 3365 3366 xmm_def = create_mask_2x32_128 (filler, filler); 3367 3368 while (height--) 3369 { 3370 int w; 3371 uint8_t *d = byte_line; 3372 byte_line += stride; 3373 w = byte_width; 3374 3375 if (w >= 1 && ((uintptr_t)d & 1)) 3376 { 3377 *(uint8_t *)d = filler; 3378 w -= 1; 3379 d += 1; 3380 } 3381 3382 while (w >= 2 && ((uintptr_t)d & 3)) 3383 { 3384 *(uint16_t *)d = filler; 3385 w -= 2; 3386 d += 2; 3387 } 3388 3389 while (w >= 4 && ((uintptr_t)d & 15)) 3390 { 3391 *(uint32_t *)d = filler; 3392 3393 w -= 4; 3394 d += 4; 3395 } 3396 3397 while (w >= 128) 3398 { 3399 save_128_aligned ((__m128i*)(d), xmm_def); 3400 save_128_aligned ((__m128i*)(d + 16), xmm_def); 3401 save_128_aligned ((__m128i*)(d + 32), xmm_def); 3402 save_128_aligned ((__m128i*)(d + 48), xmm_def); 3403 save_128_aligned ((__m128i*)(d + 64), xmm_def); 3404 save_128_aligned ((__m128i*)(d + 80), xmm_def); 3405 save_128_aligned ((__m128i*)(d + 96), xmm_def); 3406 save_128_aligned ((__m128i*)(d + 112), xmm_def); 3407 3408 d += 128; 3409 w -= 128; 3410 } 3411 3412 if (w >= 64) 3413 { 3414 save_128_aligned ((__m128i*)(d), xmm_def); 3415 save_128_aligned ((__m128i*)(d + 16), xmm_def); 3416 save_128_aligned ((__m128i*)(d + 32), xmm_def); 3417 save_128_aligned ((__m128i*)(d + 48), xmm_def); 3418 3419 d += 64; 3420 w -= 64; 3421 } 3422 3423 if (w >= 32) 3424 { 3425 save_128_aligned ((__m128i*)(d), xmm_def); 3426 save_128_aligned ((__m128i*)(d + 16), xmm_def); 3427 3428 d += 32; 3429 w -= 32; 3430 } 3431 3432 if (w >= 16) 3433 { 3434 save_128_aligned ((__m128i*)(d), xmm_def); 3435 3436 d += 16; 3437 w -= 16; 3438 } 3439 3440 while (w >= 4) 3441 { 3442 *(uint32_t *)d = filler; 3443 3444 w -= 4; 3445 d += 4; 3446 } 3447 3448 if (w >= 2) 3449 { 3450 *(uint16_t *)d = filler; 3451 w -= 2; 3452 d += 2; 3453 } 3454 3455 if (w >= 1) 3456 { 3457 *(uint8_t *)d = filler; 3458 w -= 1; 3459 d += 1; 3460 } 3461 } 3462 3463 return TRUE; 3464 } 3465 3466 static void 3467 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, 3468 pixman_composite_info_t *info) 3469 { 3470 PIXMAN_COMPOSITE_ARGS (info); 3471 uint32_t src, srca; 3472 uint32_t *dst_line, *dst; 3473 uint8_t *mask_line, *mask; 3474 int dst_stride, mask_stride; 3475 int32_t w; 3476 uint32_t m; 3477 3478 __m128i xmm_src, xmm_def; 3479 __m128i xmm_mask, 
xmm_mask_lo, xmm_mask_hi; 3480 3481 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 3482 3483 srca = src >> 24; 3484 if (src == 0) 3485 { 3486 sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, 3487 PIXMAN_FORMAT_BPP (dest_image->bits.format), 3488 dest_x, dest_y, width, height, 0); 3489 return; 3490 } 3491 3492 PIXMAN_IMAGE_GET_LINE ( 3493 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 3494 PIXMAN_IMAGE_GET_LINE ( 3495 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 3496 3497 xmm_def = create_mask_2x32_128 (src, src); 3498 xmm_src = expand_pixel_32_1x128 (src); 3499 3500 while (height--) 3501 { 3502 dst = dst_line; 3503 dst_line += dst_stride; 3504 mask = mask_line; 3505 mask_line += mask_stride; 3506 w = width; 3507 3508 while (w && (uintptr_t)dst & 15) 3509 { 3510 uint8_t m = *mask++; 3511 3512 if (m) 3513 { 3514 *dst = pack_1x128_32 ( 3515 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); 3516 } 3517 else 3518 { 3519 *dst = 0; 3520 } 3521 3522 w--; 3523 dst++; 3524 } 3525 3526 while (w >= 4) 3527 { 3528 m = *((uint32_t*)mask); 3529 3530 if (srca == 0xff && m == 0xffffffff) 3531 { 3532 save_128_aligned ((__m128i*)dst, xmm_def); 3533 } 3534 else if (m) 3535 { 3536 xmm_mask = unpack_32_1x128 (m); 3537 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); 3538 3539 /* Unpacking */ 3540 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 3541 3542 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 3543 &xmm_mask_lo, &xmm_mask_hi); 3544 3545 pix_multiply_2x128 (&xmm_src, &xmm_src, 3546 &xmm_mask_lo, &xmm_mask_hi, 3547 &xmm_mask_lo, &xmm_mask_hi); 3548 3549 save_128_aligned ( 3550 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); 3551 } 3552 else 3553 { 3554 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); 3555 } 3556 3557 w -= 4; 3558 dst += 4; 3559 mask += 4; 3560 } 3561 3562 while (w) 3563 { 3564 uint8_t m = *mask++; 3565 3566 if (m) 3567 { 3568 *dst = pack_1x128_32 ( 3569 pix_multiply_1x128 ( 3570 xmm_src, expand_pixel_8_1x128 (m))); 3571 } 3572 else 3573 { 3574 *dst = 0; 3575 } 3576 3577 w--; 3578 dst++; 3579 } 3580 } 3581 3582 } 3583 3584 static void 3585 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, 3586 pixman_composite_info_t *info) 3587 { 3588 PIXMAN_COMPOSITE_ARGS (info); 3589 uint32_t src; 3590 uint16_t *dst_line, *dst, d; 3591 uint8_t *mask_line, *mask; 3592 int dst_stride, mask_stride; 3593 int32_t w; 3594 uint32_t m; 3595 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; 3596 3597 __m128i xmm_src, xmm_alpha; 3598 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 3599 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 3600 3601 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 3602 3603 if (src == 0) 3604 return; 3605 3606 PIXMAN_IMAGE_GET_LINE ( 3607 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 3608 PIXMAN_IMAGE_GET_LINE ( 3609 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 3610 3611 xmm_src = expand_pixel_32_1x128 (src); 3612 xmm_alpha = expand_alpha_1x128 (xmm_src); 3613 mmx_src = xmm_src; 3614 mmx_alpha = xmm_alpha; 3615 3616 while (height--) 3617 { 3618 dst = dst_line; 3619 dst_line += dst_stride; 3620 mask = mask_line; 3621 mask_line += mask_stride; 3622 w = width; 3623 3624 while (w && (uintptr_t)dst & 15) 3625 { 3626 m = *mask++; 3627 3628 if (m) 3629 { 3630 d = *dst; 3631 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); 3632 mmx_dest = expand565_16_1x128 (d); 3633 3634 *dst = 
pack_565_32_16 ( 3635 pack_1x128_32 ( 3636 in_over_1x128 ( 3637 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); 3638 } 3639 3640 w--; 3641 dst++; 3642 } 3643 3644 while (w >= 8) 3645 { 3646 xmm_dst = load_128_aligned ((__m128i*) dst); 3647 unpack_565_128_4x128 (xmm_dst, 3648 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 3649 3650 m = *((uint32_t*)mask); 3651 mask += 4; 3652 3653 if (m) 3654 { 3655 xmm_mask = unpack_32_1x128 (m); 3656 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); 3657 3658 /* Unpacking */ 3659 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 3660 3661 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 3662 &xmm_mask_lo, &xmm_mask_hi); 3663 3664 in_over_2x128 (&xmm_src, &xmm_src, 3665 &xmm_alpha, &xmm_alpha, 3666 &xmm_mask_lo, &xmm_mask_hi, 3667 &xmm_dst0, &xmm_dst1); 3668 } 3669 3670 m = *((uint32_t*)mask); 3671 mask += 4; 3672 3673 if (m) 3674 { 3675 xmm_mask = unpack_32_1x128 (m); 3676 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); 3677 3678 /* Unpacking */ 3679 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 3680 3681 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 3682 &xmm_mask_lo, &xmm_mask_hi); 3683 in_over_2x128 (&xmm_src, &xmm_src, 3684 &xmm_alpha, &xmm_alpha, 3685 &xmm_mask_lo, &xmm_mask_hi, 3686 &xmm_dst2, &xmm_dst3); 3687 } 3688 3689 save_128_aligned ( 3690 (__m128i*)dst, pack_565_4x128_128 ( 3691 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); 3692 3693 w -= 8; 3694 dst += 8; 3695 } 3696 3697 while (w) 3698 { 3699 m = *mask++; 3700 3701 if (m) 3702 { 3703 d = *dst; 3704 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); 3705 mmx_dest = expand565_16_1x128 (d); 3706 3707 *dst = pack_565_32_16 ( 3708 pack_1x128_32 ( 3709 in_over_1x128 ( 3710 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); 3711 } 3712 3713 w--; 3714 dst++; 3715 } 3716 } 3717 3718 } 3719 3720 static void 3721 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, 3722 pixman_composite_info_t *info) 3723 { 3724 PIXMAN_COMPOSITE_ARGS (info); 3725 uint16_t *dst_line, *dst, d; 3726 uint32_t *src_line, *src, s; 3727 int dst_stride, src_stride; 3728 int32_t w; 3729 uint32_t opaque, zero; 3730 3731 __m128i ms; 3732 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 3733 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 3734 3735 PIXMAN_IMAGE_GET_LINE ( 3736 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 3737 PIXMAN_IMAGE_GET_LINE ( 3738 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 3739 3740 while (height--) 3741 { 3742 dst = dst_line; 3743 dst_line += dst_stride; 3744 src = src_line; 3745 src_line += src_stride; 3746 w = width; 3747 3748 while (w && (uintptr_t)dst & 15) 3749 { 3750 s = *src++; 3751 d = *dst; 3752 3753 ms = unpack_32_1x128 (s); 3754 3755 *dst++ = pack_565_32_16 ( 3756 pack_1x128_32 ( 3757 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); 3758 w--; 3759 } 3760 3761 while (w >= 8) 3762 { 3763 /* First round */ 3764 xmm_src = load_128_unaligned ((__m128i*)src); 3765 xmm_dst = load_128_aligned ((__m128i*)dst); 3766 3767 opaque = is_opaque (xmm_src); 3768 zero = is_zero (xmm_src); 3769 3770 unpack_565_128_4x128 (xmm_dst, 3771 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 3772 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 3773 3774 /* preload next round*/ 3775 xmm_src = load_128_unaligned ((__m128i*)(src + 4)); 3776 3777 if (opaque) 3778 { 3779 invert_colors_2x128 (xmm_src_lo, xmm_src_hi, 3780 &xmm_dst0, &xmm_dst1); 3781 } 3782 else if (!zero) 3783 { 3784 over_rev_non_pre_2x128 (xmm_src_lo, 
xmm_src_hi, 3785 &xmm_dst0, &xmm_dst1); 3786 } 3787 3788 /* Second round */ 3789 opaque = is_opaque (xmm_src); 3790 zero = is_zero (xmm_src); 3791 3792 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 3793 3794 if (opaque) 3795 { 3796 invert_colors_2x128 (xmm_src_lo, xmm_src_hi, 3797 &xmm_dst2, &xmm_dst3); 3798 } 3799 else if (!zero) 3800 { 3801 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, 3802 &xmm_dst2, &xmm_dst3); 3803 } 3804 3805 save_128_aligned ( 3806 (__m128i*)dst, pack_565_4x128_128 ( 3807 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); 3808 3809 w -= 8; 3810 src += 8; 3811 dst += 8; 3812 } 3813 3814 while (w) 3815 { 3816 s = *src++; 3817 d = *dst; 3818 3819 ms = unpack_32_1x128 (s); 3820 3821 *dst++ = pack_565_32_16 ( 3822 pack_1x128_32 ( 3823 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); 3824 w--; 3825 } 3826 } 3827 3828 } 3829 3830 static void 3831 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, 3832 pixman_composite_info_t *info) 3833 { 3834 PIXMAN_COMPOSITE_ARGS (info); 3835 uint32_t *dst_line, *dst, d; 3836 uint32_t *src_line, *src, s; 3837 int dst_stride, src_stride; 3838 int32_t w; 3839 uint32_t opaque, zero; 3840 3841 __m128i xmm_src_lo, xmm_src_hi; 3842 __m128i xmm_dst_lo, xmm_dst_hi; 3843 3844 PIXMAN_IMAGE_GET_LINE ( 3845 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 3846 PIXMAN_IMAGE_GET_LINE ( 3847 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 3848 3849 while (height--) 3850 { 3851 dst = dst_line; 3852 dst_line += dst_stride; 3853 src = src_line; 3854 src_line += src_stride; 3855 w = width; 3856 3857 while (w && (uintptr_t)dst & 15) 3858 { 3859 s = *src++; 3860 d = *dst; 3861 3862 *dst++ = pack_1x128_32 ( 3863 over_rev_non_pre_1x128 ( 3864 unpack_32_1x128 (s), unpack_32_1x128 (d))); 3865 3866 w--; 3867 } 3868 3869 while (w >= 4) 3870 { 3871 xmm_src_hi = load_128_unaligned ((__m128i*)src); 3872 3873 opaque = is_opaque (xmm_src_hi); 3874 zero = is_zero (xmm_src_hi); 3875 3876 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 3877 3878 if (opaque) 3879 { 3880 invert_colors_2x128 (xmm_src_lo, xmm_src_hi, 3881 &xmm_dst_lo, &xmm_dst_hi); 3882 3883 save_128_aligned ( 3884 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 3885 } 3886 else if (!zero) 3887 { 3888 xmm_dst_hi = load_128_aligned ((__m128i*)dst); 3889 3890 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 3891 3892 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, 3893 &xmm_dst_lo, &xmm_dst_hi); 3894 3895 save_128_aligned ( 3896 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 3897 } 3898 3899 w -= 4; 3900 dst += 4; 3901 src += 4; 3902 } 3903 3904 while (w) 3905 { 3906 s = *src++; 3907 d = *dst; 3908 3909 *dst++ = pack_1x128_32 ( 3910 over_rev_non_pre_1x128 ( 3911 unpack_32_1x128 (s), unpack_32_1x128 (d))); 3912 3913 w--; 3914 } 3915 } 3916 3917 } 3918 3919 static void 3920 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, 3921 pixman_composite_info_t *info) 3922 { 3923 PIXMAN_COMPOSITE_ARGS (info); 3924 uint32_t src; 3925 uint16_t *dst_line, *dst, d; 3926 uint32_t *mask_line, *mask, m; 3927 int dst_stride, mask_stride; 3928 int w; 3929 uint32_t pack_cmp; 3930 3931 __m128i xmm_src, xmm_alpha; 3932 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 3933 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 3934 3935 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; 3936 3937 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 3938 3939 if (src == 0) 3940 return; 3941 3942 PIXMAN_IMAGE_GET_LINE ( 
3943 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 3944 PIXMAN_IMAGE_GET_LINE ( 3945 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); 3946 3947 xmm_src = expand_pixel_32_1x128 (src); 3948 xmm_alpha = expand_alpha_1x128 (xmm_src); 3949 mmx_src = xmm_src; 3950 mmx_alpha = xmm_alpha; 3951 3952 while (height--) 3953 { 3954 w = width; 3955 mask = mask_line; 3956 dst = dst_line; 3957 mask_line += mask_stride; 3958 dst_line += dst_stride; 3959 3960 while (w && ((uintptr_t)dst & 15)) 3961 { 3962 m = *(uint32_t *) mask; 3963 3964 if (m) 3965 { 3966 d = *dst; 3967 mmx_mask = unpack_32_1x128 (m); 3968 mmx_dest = expand565_16_1x128 (d); 3969 3970 *dst = pack_565_32_16 ( 3971 pack_1x128_32 ( 3972 in_over_1x128 ( 3973 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); 3974 } 3975 3976 w--; 3977 dst++; 3978 mask++; 3979 } 3980 3981 while (w >= 8) 3982 { 3983 /* First round */ 3984 xmm_mask = load_128_unaligned ((__m128i*)mask); 3985 xmm_dst = load_128_aligned ((__m128i*)dst); 3986 3987 pack_cmp = _mm_movemask_epi8 ( 3988 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); 3989 3990 unpack_565_128_4x128 (xmm_dst, 3991 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 3992 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 3993 3994 /* preload next round */ 3995 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); 3996 3997 /* skip the first four pixels if all of their mask values are zero */ 3998 if (pack_cmp != 0xffff) 3999 { 4000 in_over_2x128 (&xmm_src, &xmm_src, 4001 &xmm_alpha, &xmm_alpha, 4002 &xmm_mask_lo, &xmm_mask_hi, 4003 &xmm_dst0, &xmm_dst1); 4004 } 4005 4006 /* Second round */ 4007 pack_cmp = _mm_movemask_epi8 ( 4008 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); 4009 4010 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 4011 4012 if (pack_cmp != 0xffff) 4013 { 4014 in_over_2x128 (&xmm_src, &xmm_src, 4015 &xmm_alpha, &xmm_alpha, 4016 &xmm_mask_lo, &xmm_mask_hi, 4017 &xmm_dst2, &xmm_dst3); 4018 } 4019 4020 save_128_aligned ( 4021 (__m128i*)dst, pack_565_4x128_128 ( 4022 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); 4023 4024 w -= 8; 4025 dst += 8; 4026 mask += 8; 4027 } 4028 4029 while (w) 4030 { 4031 m = *(uint32_t *) mask; 4032 4033 if (m) 4034 { 4035 d = *dst; 4036 mmx_mask = unpack_32_1x128 (m); 4037 mmx_dest = expand565_16_1x128 (d); 4038 4039 *dst = pack_565_32_16 ( 4040 pack_1x128_32 ( 4041 in_over_1x128 ( 4042 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); 4043 } 4044 4045 w--; 4046 dst++; 4047 mask++; 4048 } 4049 } 4050 4051 } 4052 4053 static void 4054 sse2_composite_in_n_8_8 (pixman_implementation_t *imp, 4055 pixman_composite_info_t *info) 4056 { 4057 PIXMAN_COMPOSITE_ARGS (info); 4058 uint8_t *dst_line, *dst; 4059 uint8_t *mask_line, *mask; 4060 int dst_stride, mask_stride; 4061 uint32_t d, m; 4062 uint32_t src; 4063 int32_t w; 4064 4065 __m128i xmm_alpha; 4066 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 4067 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 4068 4069 PIXMAN_IMAGE_GET_LINE ( 4070 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 4071 PIXMAN_IMAGE_GET_LINE ( 4072 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 4073 4074 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 4075 4076 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); 4077 4078 while (height--) 4079 { 4080 dst = dst_line; 4081 dst_line += dst_stride; 4082 mask = mask_line; 4083 mask_line += mask_stride; 4084 w = width; 4085 4086 while (w && ((uintptr_t)dst & 15)) 4087 { 4088 m = (uint32_t) *mask++; 4089 d = (uint32_t) *dst; 4090 4091 *dst++ =
(uint8_t) pack_1x128_32 ( 4092 pix_multiply_1x128 ( 4093 pix_multiply_1x128 (xmm_alpha, 4094 unpack_32_1x128 (m)), 4095 unpack_32_1x128 (d))); 4096 w--; 4097 } 4098 4099 while (w >= 16) 4100 { 4101 xmm_mask = load_128_unaligned ((__m128i*)mask); 4102 xmm_dst = load_128_aligned ((__m128i*)dst); 4103 4104 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 4105 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 4106 4107 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, 4108 &xmm_mask_lo, &xmm_mask_hi, 4109 &xmm_mask_lo, &xmm_mask_hi); 4110 4111 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, 4112 &xmm_dst_lo, &xmm_dst_hi, 4113 &xmm_dst_lo, &xmm_dst_hi); 4114 4115 save_128_aligned ( 4116 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 4117 4118 mask += 16; 4119 dst += 16; 4120 w -= 16; 4121 } 4122 4123 while (w) 4124 { 4125 m = (uint32_t) *mask++; 4126 d = (uint32_t) *dst; 4127 4128 *dst++ = (uint8_t) pack_1x128_32 ( 4129 pix_multiply_1x128 ( 4130 pix_multiply_1x128 ( 4131 xmm_alpha, unpack_32_1x128 (m)), 4132 unpack_32_1x128 (d))); 4133 w--; 4134 } 4135 } 4136 4137 } 4138 4139 static void 4140 sse2_composite_in_n_8 (pixman_implementation_t *imp, 4141 pixman_composite_info_t *info) 4142 { 4143 PIXMAN_COMPOSITE_ARGS (info); 4144 uint8_t *dst_line, *dst; 4145 int dst_stride; 4146 uint32_t d; 4147 uint32_t src; 4148 int32_t w; 4149 4150 __m128i xmm_alpha; 4151 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 4152 4153 PIXMAN_IMAGE_GET_LINE ( 4154 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 4155 4156 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 4157 4158 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); 4159 4160 src = src >> 24; 4161 4162 if (src == 0xff) 4163 return; 4164 4165 if (src == 0x00) 4166 { 4167 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 4168 8, dest_x, dest_y, width, height, src); 4169 4170 return; 4171 } 4172 4173 while (height--) 4174 { 4175 dst = dst_line; 4176 dst_line += dst_stride; 4177 w = width; 4178 4179 while (w && ((uintptr_t)dst & 15)) 4180 { 4181 d = (uint32_t) *dst; 4182 4183 *dst++ = (uint8_t) pack_1x128_32 ( 4184 pix_multiply_1x128 ( 4185 xmm_alpha, 4186 unpack_32_1x128 (d))); 4187 w--; 4188 } 4189 4190 while (w >= 16) 4191 { 4192 xmm_dst = load_128_aligned ((__m128i*)dst); 4193 4194 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 4195 4196 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, 4197 &xmm_dst_lo, &xmm_dst_hi, 4198 &xmm_dst_lo, &xmm_dst_hi); 4199 4200 save_128_aligned ( 4201 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 4202 4203 dst += 16; 4204 w -= 16; 4205 } 4206 4207 while (w) 4208 { 4209 d = (uint32_t) *dst; 4210 4211 *dst++ = (uint8_t) pack_1x128_32 ( 4212 pix_multiply_1x128 ( 4213 xmm_alpha, 4214 unpack_32_1x128 (d))); 4215 w--; 4216 } 4217 } 4218 4219 } 4220 4221 static void 4222 sse2_composite_in_8_8 (pixman_implementation_t *imp, 4223 pixman_composite_info_t *info) 4224 { 4225 PIXMAN_COMPOSITE_ARGS (info); 4226 uint8_t *dst_line, *dst; 4227 uint8_t *src_line, *src; 4228 int src_stride, dst_stride; 4229 int32_t w; 4230 uint32_t s, d; 4231 4232 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 4233 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 4234 4235 PIXMAN_IMAGE_GET_LINE ( 4236 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 4237 PIXMAN_IMAGE_GET_LINE ( 4238 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); 4239 4240 while (height--) 4241 { 4242 dst = dst_line; 4243 dst_line += dst_stride; 4244 src = src_line; 4245 src_line += src_stride; 
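/* The IN operator between two a8 surfaces is a plain per-byte multiply,
 * roughly dst = src * dst / 255 (pix_multiply applies the usual rounding
 * correction).  The loops below follow the same head/body/tail pattern
 * as the other fast paths, handling 16 alpha bytes per vector iteration.
 */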
4246 w = width; 4247 4248 while (w && ((uintptr_t)dst & 15)) 4249 { 4250 s = (uint32_t) *src++; 4251 d = (uint32_t) *dst; 4252 4253 *dst++ = (uint8_t) pack_1x128_32 ( 4254 pix_multiply_1x128 ( 4255 unpack_32_1x128 (s), unpack_32_1x128 (d))); 4256 w--; 4257 } 4258 4259 while (w >= 16) 4260 { 4261 xmm_src = load_128_unaligned ((__m128i*)src); 4262 xmm_dst = load_128_aligned ((__m128i*)dst); 4263 4264 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 4265 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 4266 4267 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 4268 &xmm_dst_lo, &xmm_dst_hi, 4269 &xmm_dst_lo, &xmm_dst_hi); 4270 4271 save_128_aligned ( 4272 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 4273 4274 src += 16; 4275 dst += 16; 4276 w -= 16; 4277 } 4278 4279 while (w) 4280 { 4281 s = (uint32_t) *src++; 4282 d = (uint32_t) *dst; 4283 4284 *dst++ = (uint8_t) pack_1x128_32 ( 4285 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d))); 4286 w--; 4287 } 4288 } 4289 4290 } 4291 4292 static void 4293 sse2_composite_add_n_8_8 (pixman_implementation_t *imp, 4294 pixman_composite_info_t *info) 4295 { 4296 PIXMAN_COMPOSITE_ARGS (info); 4297 uint8_t *dst_line, *dst; 4298 uint8_t *mask_line, *mask; 4299 int dst_stride, mask_stride; 4300 int32_t w; 4301 uint32_t src; 4302 uint32_t m, d; 4303 4304 __m128i xmm_alpha; 4305 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 4306 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 4307 4308 PIXMAN_IMAGE_GET_LINE ( 4309 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 4310 PIXMAN_IMAGE_GET_LINE ( 4311 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 4312 4313 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 4314 4315 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); 4316 4317 while (height--) 4318 { 4319 dst = dst_line; 4320 dst_line += dst_stride; 4321 mask = mask_line; 4322 mask_line += mask_stride; 4323 w = width; 4324 4325 while (w && ((uintptr_t)dst & 15)) 4326 { 4327 m = (uint32_t) *mask++; 4328 d = (uint32_t) *dst; 4329 4330 *dst++ = (uint8_t) pack_1x128_32 ( 4331 _mm_adds_epu16 ( 4332 pix_multiply_1x128 ( 4333 xmm_alpha, unpack_32_1x128 (m)), 4334 unpack_32_1x128 (d))); 4335 w--; 4336 } 4337 4338 while (w >= 16) 4339 { 4340 xmm_mask = load_128_unaligned ((__m128i*)mask); 4341 xmm_dst = load_128_aligned ((__m128i*)dst); 4342 4343 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 4344 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 4345 4346 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, 4347 &xmm_mask_lo, &xmm_mask_hi, 4348 &xmm_mask_lo, &xmm_mask_hi); 4349 4350 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); 4351 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); 4352 4353 save_128_aligned ( 4354 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 4355 4356 mask += 16; 4357 dst += 16; 4358 w -= 16; 4359 } 4360 4361 while (w) 4362 { 4363 m = (uint32_t) *mask++; 4364 d = (uint32_t) *dst; 4365 4366 *dst++ = (uint8_t) pack_1x128_32 ( 4367 _mm_adds_epu16 ( 4368 pix_multiply_1x128 ( 4369 xmm_alpha, unpack_32_1x128 (m)), 4370 unpack_32_1x128 (d))); 4371 4372 w--; 4373 } 4374 } 4375 4376 } 4377 4378 static void 4379 sse2_composite_add_n_8 (pixman_implementation_t *imp, 4380 pixman_composite_info_t *info) 4381 { 4382 PIXMAN_COMPOSITE_ARGS (info); 4383 uint8_t *dst_line, *dst; 4384 int dst_stride; 4385 int32_t w; 4386 uint32_t src; 4387 4388 __m128i xmm_src; 4389 4390 PIXMAN_IMAGE_GET_LINE ( 4391 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 
1); 4392 4393 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 4394 4395 src >>= 24; 4396 4397 if (src == 0x00) 4398 return; 4399 4400 if (src == 0xff) 4401 { 4402 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 4403 8, dest_x, dest_y, width, height, 0xff); 4404 4405 return; 4406 } 4407 4408 src = (src << 24) | (src << 16) | (src << 8) | src; 4409 xmm_src = _mm_set_epi32 (src, src, src, src); 4410 4411 while (height--) 4412 { 4413 dst = dst_line; 4414 dst_line += dst_stride; 4415 w = width; 4416 4417 while (w && ((uintptr_t)dst & 15)) 4418 { 4419 *dst = (uint8_t)_mm_cvtsi128_si32 ( 4420 _mm_adds_epu8 ( 4421 xmm_src, 4422 _mm_cvtsi32_si128 (*dst))); 4423 4424 w--; 4425 dst++; 4426 } 4427 4428 while (w >= 16) 4429 { 4430 save_128_aligned ( 4431 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); 4432 4433 dst += 16; 4434 w -= 16; 4435 } 4436 4437 while (w) 4438 { 4439 *dst = (uint8_t)_mm_cvtsi128_si32 ( 4440 _mm_adds_epu8 ( 4441 xmm_src, 4442 _mm_cvtsi32_si128 (*dst))); 4443 4444 w--; 4445 dst++; 4446 } 4447 } 4448 4449 } 4450 4451 static void 4452 sse2_composite_add_8_8 (pixman_implementation_t *imp, 4453 pixman_composite_info_t *info) 4454 { 4455 PIXMAN_COMPOSITE_ARGS (info); 4456 uint8_t *dst_line, *dst; 4457 uint8_t *src_line, *src; 4458 int dst_stride, src_stride; 4459 int32_t w; 4460 uint16_t t; 4461 4462 PIXMAN_IMAGE_GET_LINE ( 4463 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); 4464 PIXMAN_IMAGE_GET_LINE ( 4465 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 4466 4467 while (height--) 4468 { 4469 dst = dst_line; 4470 src = src_line; 4471 4472 dst_line += dst_stride; 4473 src_line += src_stride; 4474 w = width; 4475 4476 /* Small head */ 4477 while (w && (uintptr_t)dst & 3) 4478 { 4479 t = (*dst) + (*src++); 4480 *dst++ = t | (0 - (t >> 8)); 4481 w--; 4482 } 4483 4484 sse2_combine_add_u (imp, op, 4485 (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); 4486 4487 /* Small tail */ 4488 dst += w & 0xfffc; 4489 src += w & 0xfffc; 4490 4491 w &= 3; 4492 4493 while (w) 4494 { 4495 t = (*dst) + (*src++); 4496 *dst++ = t | (0 - (t >> 8)); 4497 w--; 4498 } 4499 } 4500 4501 } 4502 4503 static void 4504 sse2_composite_add_8888_8888 (pixman_implementation_t *imp, 4505 pixman_composite_info_t *info) 4506 { 4507 PIXMAN_COMPOSITE_ARGS (info); 4508 uint32_t *dst_line, *dst; 4509 uint32_t *src_line, *src; 4510 int dst_stride, src_stride; 4511 4512 PIXMAN_IMAGE_GET_LINE ( 4513 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 4514 PIXMAN_IMAGE_GET_LINE ( 4515 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 4516 4517 while (height--) 4518 { 4519 dst = dst_line; 4520 dst_line += dst_stride; 4521 src = src_line; 4522 src_line += src_stride; 4523 4524 sse2_combine_add_u (imp, op, dst, src, NULL, width); 4525 } 4526 } 4527 4528 static void 4529 sse2_composite_add_n_8888 (pixman_implementation_t *imp, 4530 pixman_composite_info_t *info) 4531 { 4532 PIXMAN_COMPOSITE_ARGS (info); 4533 uint32_t *dst_line, *dst, src; 4534 int dst_stride; 4535 4536 __m128i xmm_src; 4537 4538 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 4539 4540 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 4541 if (src == 0) 4542 return; 4543 4544 if (src == ~0) 4545 { 4546 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32, 4547 dest_x, dest_y, width, height, ~0); 4548 4549 return; 4550 } 4551 4552 xmm_src = _mm_set_epi32 (src, src, src, src); 4553 
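/* The solid source has been replicated into all four 32-bit lanes of
 * xmm_src above.  Each row is then a saturating add: single pixels via
 * _mm_adds_epu8 until dst reaches 16-byte alignment, four pixels at a
 * time over the aligned body, and a scalar tail.  The src == 0 and
 * src == ~0 cases were already handled earlier as a no-op and a
 * pixman_fill respectively.
 */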
while (height--) 4554 { 4555 int w = width; 4556 uint32_t d; 4557 4558 dst = dst_line; 4559 dst_line += dst_stride; 4560 4561 while (w && (uintptr_t)dst & 15) 4562 { 4563 d = *dst; 4564 *dst++ = 4565 _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d))); 4566 w--; 4567 } 4568 4569 while (w >= 4) 4570 { 4571 save_128_aligned 4572 ((__m128i*)dst, 4573 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); 4574 4575 dst += 4; 4576 w -= 4; 4577 } 4578 4579 while (w--) 4580 { 4581 d = *dst; 4582 *dst++ = 4583 _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src, 4584 _mm_cvtsi32_si128 (d))); 4585 } 4586 } 4587 } 4588 4589 static void 4590 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp, 4591 pixman_composite_info_t *info) 4592 { 4593 PIXMAN_COMPOSITE_ARGS (info); 4594 uint32_t *dst_line, *dst; 4595 uint8_t *mask_line, *mask; 4596 int dst_stride, mask_stride; 4597 int32_t w; 4598 uint32_t src; 4599 4600 __m128i xmm_src; 4601 4602 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 4603 if (src == 0) 4604 return; 4605 xmm_src = expand_pixel_32_1x128 (src); 4606 4607 PIXMAN_IMAGE_GET_LINE ( 4608 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 4609 PIXMAN_IMAGE_GET_LINE ( 4610 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 4611 4612 while (height--) 4613 { 4614 dst = dst_line; 4615 dst_line += dst_stride; 4616 mask = mask_line; 4617 mask_line += mask_stride; 4618 w = width; 4619 4620 while (w && ((uintptr_t)dst & 15)) 4621 { 4622 uint8_t m = *mask++; 4623 if (m) 4624 { 4625 *dst = pack_1x128_32 4626 (_mm_adds_epu16 4627 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), 4628 unpack_32_1x128 (*dst))); 4629 } 4630 dst++; 4631 w--; 4632 } 4633 4634 while (w >= 4) 4635 { 4636 uint32_t m = *(uint32_t*)mask; 4637 if (m) 4638 { 4639 __m128i xmm_mask_lo, xmm_mask_hi; 4640 __m128i xmm_dst_lo, xmm_dst_hi; 4641 4642 __m128i xmm_dst = load_128_aligned ((__m128i*)dst); 4643 __m128i xmm_mask = 4644 _mm_unpacklo_epi8 (unpack_32_1x128(m), 4645 _mm_setzero_si128 ()); 4646 4647 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 4648 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 4649 4650 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 4651 &xmm_mask_lo, &xmm_mask_hi); 4652 4653 pix_multiply_2x128 (&xmm_src, &xmm_src, 4654 &xmm_mask_lo, &xmm_mask_hi, 4655 &xmm_mask_lo, &xmm_mask_hi); 4656 4657 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); 4658 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); 4659 4660 save_128_aligned ( 4661 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 4662 } 4663 4664 w -= 4; 4665 dst += 4; 4666 mask += 4; 4667 } 4668 4669 while (w) 4670 { 4671 uint8_t m = *mask++; 4672 if (m) 4673 { 4674 *dst = pack_1x128_32 4675 (_mm_adds_epu16 4676 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), 4677 unpack_32_1x128 (*dst))); 4678 } 4679 dst++; 4680 w--; 4681 } 4682 } 4683 } 4684 4685 static pixman_bool_t 4686 sse2_blt (pixman_implementation_t *imp, 4687 uint32_t * src_bits, 4688 uint32_t * dst_bits, 4689 int src_stride, 4690 int dst_stride, 4691 int src_bpp, 4692 int dst_bpp, 4693 int src_x, 4694 int src_y, 4695 int dest_x, 4696 int dest_y, 4697 int width, 4698 int height) 4699 { 4700 uint8_t * src_bytes; 4701 uint8_t * dst_bytes; 4702 int byte_width; 4703 4704 if (src_bpp != dst_bpp) 4705 return FALSE; 4706 4707 if (src_bpp == 16) 4708 { 4709 src_stride = src_stride * (int) sizeof (uint32_t) / 2; 4710 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; 4711 src_bytes =(uint8_t 
*)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); 4712 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); 4713 byte_width = 2 * width; 4714 src_stride *= 2; 4715 dst_stride *= 2; 4716 } 4717 else if (src_bpp == 32) 4718 { 4719 src_stride = src_stride * (int) sizeof (uint32_t) / 4; 4720 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; 4721 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); 4722 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); 4723 byte_width = 4 * width; 4724 src_stride *= 4; 4725 dst_stride *= 4; 4726 } 4727 else 4728 { 4729 return FALSE; 4730 } 4731 4732 while (height--) 4733 { 4734 int w; 4735 uint8_t *s = src_bytes; 4736 uint8_t *d = dst_bytes; 4737 src_bytes += src_stride; 4738 dst_bytes += dst_stride; 4739 w = byte_width; 4740 4741 while (w >= 2 && ((uintptr_t)d & 3)) 4742 { 4743 *(uint16_t *)d = *(uint16_t *)s; 4744 w -= 2; 4745 s += 2; 4746 d += 2; 4747 } 4748 4749 while (w >= 4 && ((uintptr_t)d & 15)) 4750 { 4751 *(uint32_t *)d = *(uint32_t *)s; 4752 4753 w -= 4; 4754 s += 4; 4755 d += 4; 4756 } 4757 4758 while (w >= 64) 4759 { 4760 __m128i xmm0, xmm1, xmm2, xmm3; 4761 4762 xmm0 = load_128_unaligned ((__m128i*)(s)); 4763 xmm1 = load_128_unaligned ((__m128i*)(s + 16)); 4764 xmm2 = load_128_unaligned ((__m128i*)(s + 32)); 4765 xmm3 = load_128_unaligned ((__m128i*)(s + 48)); 4766 4767 save_128_aligned ((__m128i*)(d), xmm0); 4768 save_128_aligned ((__m128i*)(d + 16), xmm1); 4769 save_128_aligned ((__m128i*)(d + 32), xmm2); 4770 save_128_aligned ((__m128i*)(d + 48), xmm3); 4771 4772 s += 64; 4773 d += 64; 4774 w -= 64; 4775 } 4776 4777 while (w >= 16) 4778 { 4779 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); 4780 4781 w -= 16; 4782 d += 16; 4783 s += 16; 4784 } 4785 4786 while (w >= 4) 4787 { 4788 *(uint32_t *)d = *(uint32_t *)s; 4789 4790 w -= 4; 4791 s += 4; 4792 d += 4; 4793 } 4794 4795 if (w >= 2) 4796 { 4797 *(uint16_t *)d = *(uint16_t *)s; 4798 w -= 2; 4799 s += 2; 4800 d += 2; 4801 } 4802 } 4803 4804 return TRUE; 4805 } 4806 4807 static void 4808 sse2_composite_copy_area (pixman_implementation_t *imp, 4809 pixman_composite_info_t *info) 4810 { 4811 PIXMAN_COMPOSITE_ARGS (info); 4812 sse2_blt (imp, src_image->bits.bits, 4813 dest_image->bits.bits, 4814 src_image->bits.rowstride, 4815 dest_image->bits.rowstride, 4816 PIXMAN_FORMAT_BPP (src_image->bits.format), 4817 PIXMAN_FORMAT_BPP (dest_image->bits.format), 4818 src_x, src_y, dest_x, dest_y, width, height); 4819 } 4820 4821 static void 4822 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, 4823 pixman_composite_info_t *info) 4824 { 4825 PIXMAN_COMPOSITE_ARGS (info); 4826 uint32_t *src, *src_line, s; 4827 uint32_t *dst, *dst_line, d; 4828 uint8_t *mask, *mask_line; 4829 uint32_t m; 4830 int src_stride, mask_stride, dst_stride; 4831 int32_t w; 4832 __m128i ms; 4833 4834 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 4835 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 4836 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 4837 4838 PIXMAN_IMAGE_GET_LINE ( 4839 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 4840 PIXMAN_IMAGE_GET_LINE ( 4841 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 4842 PIXMAN_IMAGE_GET_LINE ( 4843 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 4844 4845 while (height--) 4846 { 4847 src = src_line; 4848 src_line += src_stride; 4849 dst = dst_line; 4850 dst_line += dst_stride; 4851 mask = mask_line; 4852 mask_line += 
mask_stride; 4853 4854 w = width; 4855 4856 while (w && (uintptr_t)dst & 15) 4857 { 4858 s = 0xff000000 | *src++; 4859 m = (uint32_t) *mask++; 4860 d = *dst; 4861 ms = unpack_32_1x128 (s); 4862 4863 if (m != 0xff) 4864 { 4865 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); 4866 __m128i md = unpack_32_1x128 (d); 4867 4868 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md); 4869 } 4870 4871 *dst++ = pack_1x128_32 (ms); 4872 w--; 4873 } 4874 4875 while (w >= 4) 4876 { 4877 m = *(uint32_t*) mask; 4878 xmm_src = _mm_or_si128 ( 4879 load_128_unaligned ((__m128i*)src), mask_ff000000); 4880 4881 if (m == 0xffffffff) 4882 { 4883 save_128_aligned ((__m128i*)dst, xmm_src); 4884 } 4885 else 4886 { 4887 xmm_dst = load_128_aligned ((__m128i*)dst); 4888 4889 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); 4890 4891 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 4892 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 4893 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 4894 4895 expand_alpha_rev_2x128 ( 4896 xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 4897 4898 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 4899 &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, 4900 &xmm_dst_lo, &xmm_dst_hi); 4901 4902 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 4903 } 4904 4905 src += 4; 4906 dst += 4; 4907 mask += 4; 4908 w -= 4; 4909 } 4910 4911 while (w) 4912 { 4913 m = (uint32_t) *mask++; 4914 4915 if (m) 4916 { 4917 s = 0xff000000 | *src; 4918 4919 if (m == 0xff) 4920 { 4921 *dst = s; 4922 } 4923 else 4924 { 4925 __m128i ma, md, ms; 4926 4927 d = *dst; 4928 4929 ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); 4930 md = unpack_32_1x128 (d); 4931 ms = unpack_32_1x128 (s); 4932 4933 *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md)); 4934 } 4935 4936 } 4937 4938 src++; 4939 dst++; 4940 w--; 4941 } 4942 } 4943 4944 } 4945 4946 static void 4947 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, 4948 pixman_composite_info_t *info) 4949 { 4950 PIXMAN_COMPOSITE_ARGS (info); 4951 uint32_t *src, *src_line, s; 4952 uint32_t *dst, *dst_line, d; 4953 uint8_t *mask, *mask_line; 4954 uint32_t m; 4955 int src_stride, mask_stride, dst_stride; 4956 int32_t w; 4957 4958 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; 4959 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 4960 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 4961 4962 PIXMAN_IMAGE_GET_LINE ( 4963 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 4964 PIXMAN_IMAGE_GET_LINE ( 4965 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 4966 PIXMAN_IMAGE_GET_LINE ( 4967 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 4968 4969 while (height--) 4970 { 4971 src = src_line; 4972 src_line += src_stride; 4973 dst = dst_line; 4974 dst_line += dst_stride; 4975 mask = mask_line; 4976 mask_line += mask_stride; 4977 4978 w = width; 4979 4980 while (w && (uintptr_t)dst & 15) 4981 { 4982 uint32_t sa; 4983 4984 s = *src++; 4985 m = (uint32_t) *mask++; 4986 d = *dst; 4987 4988 sa = s >> 24; 4989 4990 if (m) 4991 { 4992 if (sa == 0xff && m == 0xff) 4993 { 4994 *dst = s; 4995 } 4996 else 4997 { 4998 __m128i ms, md, ma, msa; 4999 5000 ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); 5001 ms = unpack_32_1x128 (s); 5002 md = unpack_32_1x128 (d); 5003 5004 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); 5005 5006 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); 5007 } 5008 } 5009 5010 dst++; 5011 w--; 5012 } 5013 5014 
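/* Four-pixel body: the four a8 mask values are fetched as one 32-bit
 * word.  A zero word leaves the destination untouched; a word of
 * 0xffffffff combined with a fully opaque source block lets the source
 * be stored directly; otherwise the general in_over path runs with the
 * source alpha and the expanded mask.
 */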
while (w >= 4) 5015 { 5016 m = *(uint32_t *) mask; 5017 5018 if (m) 5019 { 5020 xmm_src = load_128_unaligned ((__m128i*)src); 5021 5022 if (m == 0xffffffff && is_opaque (xmm_src)) 5023 { 5024 save_128_aligned ((__m128i *)dst, xmm_src); 5025 } 5026 else 5027 { 5028 xmm_dst = load_128_aligned ((__m128i *)dst); 5029 5030 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); 5031 5032 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 5033 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 5034 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 5035 5036 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); 5037 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 5038 5039 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, 5040 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); 5041 5042 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 5043 } 5044 } 5045 5046 src += 4; 5047 dst += 4; 5048 mask += 4; 5049 w -= 4; 5050 } 5051 5052 while (w) 5053 { 5054 uint32_t sa; 5055 5056 s = *src++; 5057 m = (uint32_t) *mask++; 5058 d = *dst; 5059 5060 sa = s >> 24; 5061 5062 if (m) 5063 { 5064 if (sa == 0xff && m == 0xff) 5065 { 5066 *dst = s; 5067 } 5068 else 5069 { 5070 __m128i ms, md, ma, msa; 5071 5072 ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); 5073 ms = unpack_32_1x128 (s); 5074 md = unpack_32_1x128 (d); 5075 5076 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); 5077 5078 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); 5079 } 5080 } 5081 5082 dst++; 5083 w--; 5084 } 5085 } 5086 5087 } 5088 5089 static void 5090 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, 5091 pixman_composite_info_t *info) 5092 { 5093 PIXMAN_COMPOSITE_ARGS (info); 5094 uint32_t src; 5095 uint32_t *dst_line, *dst; 5096 __m128i xmm_src; 5097 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 5098 __m128i xmm_dsta_hi, xmm_dsta_lo; 5099 int dst_stride; 5100 int32_t w; 5101 5102 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 5103 5104 if (src == 0) 5105 return; 5106 5107 PIXMAN_IMAGE_GET_LINE ( 5108 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 5109 5110 xmm_src = expand_pixel_32_1x128 (src); 5111 5112 while (height--) 5113 { 5114 dst = dst_line; 5115 5116 dst_line += dst_stride; 5117 w = width; 5118 5119 while (w && (uintptr_t)dst & 15) 5120 { 5121 __m128i vd; 5122 5123 vd = unpack_32_1x128 (*dst); 5124 5125 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), 5126 xmm_src)); 5127 w--; 5128 dst++; 5129 } 5130 5131 while (w >= 4) 5132 { 5133 __m128i tmp_lo, tmp_hi; 5134 5135 xmm_dst = load_128_aligned ((__m128i*)dst); 5136 5137 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 5138 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi); 5139 5140 tmp_lo = xmm_src; 5141 tmp_hi = xmm_src; 5142 5143 over_2x128 (&xmm_dst_lo, &xmm_dst_hi, 5144 &xmm_dsta_lo, &xmm_dsta_hi, 5145 &tmp_lo, &tmp_hi); 5146 5147 save_128_aligned ( 5148 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi)); 5149 5150 w -= 4; 5151 dst += 4; 5152 } 5153 5154 while (w) 5155 { 5156 __m128i vd; 5157 5158 vd = unpack_32_1x128 (*dst); 5159 5160 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), 5161 xmm_src)); 5162 w--; 5163 dst++; 5164 } 5165 5166 } 5167 5168 } 5169 5170 static void 5171 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, 5172 pixman_composite_info_t *info) 5173 { 5174 PIXMAN_COMPOSITE_ARGS (info); 5175 uint32_t 
*src, *src_line, s; 5176 uint32_t *dst, *dst_line, d; 5177 uint32_t *mask, *mask_line; 5178 uint32_t m; 5179 int src_stride, mask_stride, dst_stride; 5180 int32_t w; 5181 5182 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; 5183 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 5184 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 5185 5186 PIXMAN_IMAGE_GET_LINE ( 5187 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 5188 PIXMAN_IMAGE_GET_LINE ( 5189 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); 5190 PIXMAN_IMAGE_GET_LINE ( 5191 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 5192 5193 while (height--) 5194 { 5195 src = src_line; 5196 src_line += src_stride; 5197 dst = dst_line; 5198 dst_line += dst_stride; 5199 mask = mask_line; 5200 mask_line += mask_stride; 5201 5202 w = width; 5203 5204 while (w && (uintptr_t)dst & 15) 5205 { 5206 uint32_t sa; 5207 5208 s = *src++; 5209 m = (*mask++) >> 24; 5210 d = *dst; 5211 5212 sa = s >> 24; 5213 5214 if (m) 5215 { 5216 if (sa == 0xff && m == 0xff) 5217 { 5218 *dst = s; 5219 } 5220 else 5221 { 5222 __m128i ms, md, ma, msa; 5223 5224 ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); 5225 ms = unpack_32_1x128 (s); 5226 md = unpack_32_1x128 (d); 5227 5228 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); 5229 5230 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); 5231 } 5232 } 5233 5234 dst++; 5235 w--; 5236 } 5237 5238 while (w >= 4) 5239 { 5240 xmm_mask = load_128_unaligned ((__m128i*)mask); 5241 5242 if (!is_transparent (xmm_mask)) 5243 { 5244 xmm_src = load_128_unaligned ((__m128i*)src); 5245 5246 if (is_opaque (xmm_mask) && is_opaque (xmm_src)) 5247 { 5248 save_128_aligned ((__m128i *)dst, xmm_src); 5249 } 5250 else 5251 { 5252 xmm_dst = load_128_aligned ((__m128i *)dst); 5253 5254 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 5255 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 5256 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 5257 5258 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); 5259 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 5260 5261 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, 5262 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); 5263 5264 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 5265 } 5266 } 5267 5268 src += 4; 5269 dst += 4; 5270 mask += 4; 5271 w -= 4; 5272 } 5273 5274 while (w) 5275 { 5276 uint32_t sa; 5277 5278 s = *src++; 5279 m = (*mask++) >> 24; 5280 d = *dst; 5281 5282 sa = s >> 24; 5283 5284 if (m) 5285 { 5286 if (sa == 0xff && m == 0xff) 5287 { 5288 *dst = s; 5289 } 5290 else 5291 { 5292 __m128i ms, md, ma, msa; 5293 5294 ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); 5295 ms = unpack_32_1x128 (s); 5296 md = unpack_32_1x128 (d); 5297 5298 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); 5299 5300 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); 5301 } 5302 } 5303 5304 dst++; 5305 w--; 5306 } 5307 } 5308 5309 } 5310 5311 /* A variant of 'sse2_combine_over_u' with minor tweaks */ 5312 static force_inline void 5313 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, 5314 const uint32_t* ps, 5315 int32_t w, 5316 pixman_fixed_t vx, 5317 pixman_fixed_t unit_x, 5318 pixman_fixed_t src_width_fixed, 5319 pixman_bool_t fully_transparent_src) 5320 { 5321 uint32_t s, d; 5322 const uint32_t* pm = NULL; 5323 5324 __m128i xmm_dst_lo, xmm_dst_hi; 5325 __m128i xmm_src_lo, xmm_src_hi; 5326 __m128i xmm_alpha_lo, 
xmm_alpha_hi; 5327 5328 if (fully_transparent_src) 5329 return; 5330 5331 /* Align dst on a 16-byte boundary */ 5332 while (w && ((uintptr_t)pd & 15)) 5333 { 5334 d = *pd; 5335 s = combine1 (ps + pixman_fixed_to_int (vx), pm); 5336 vx += unit_x; 5337 while (vx >= 0) 5338 vx -= src_width_fixed; 5339 5340 *pd++ = core_combine_over_u_pixel_sse2 (s, d); 5341 if (pm) 5342 pm++; 5343 w--; 5344 } 5345 5346 while (w >= 4) 5347 { 5348 __m128i tmp; 5349 uint32_t tmp1, tmp2, tmp3, tmp4; 5350 5351 tmp1 = *(ps + pixman_fixed_to_int (vx)); 5352 vx += unit_x; 5353 while (vx >= 0) 5354 vx -= src_width_fixed; 5355 tmp2 = *(ps + pixman_fixed_to_int (vx)); 5356 vx += unit_x; 5357 while (vx >= 0) 5358 vx -= src_width_fixed; 5359 tmp3 = *(ps + pixman_fixed_to_int (vx)); 5360 vx += unit_x; 5361 while (vx >= 0) 5362 vx -= src_width_fixed; 5363 tmp4 = *(ps + pixman_fixed_to_int (vx)); 5364 vx += unit_x; 5365 while (vx >= 0) 5366 vx -= src_width_fixed; 5367 5368 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); 5369 5370 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm); 5371 5372 if (is_opaque (xmm_src_hi)) 5373 { 5374 save_128_aligned ((__m128i*)pd, xmm_src_hi); 5375 } 5376 else if (!is_zero (xmm_src_hi)) 5377 { 5378 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 5379 5380 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 5381 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 5382 5383 expand_alpha_2x128 ( 5384 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); 5385 5386 over_2x128 (&xmm_src_lo, &xmm_src_hi, 5387 &xmm_alpha_lo, &xmm_alpha_hi, 5388 &xmm_dst_lo, &xmm_dst_hi); 5389 5390 /* rebuild the 4 pixel data and save */ 5391 save_128_aligned ((__m128i*)pd, 5392 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 5393 } 5394 5395 w -= 4; 5396 pd += 4; 5397 if (pm) 5398 pm += 4; 5399 } 5400 5401 while (w) 5402 { 5403 d = *pd; 5404 s = combine1 (ps + pixman_fixed_to_int (vx), pm); 5405 vx += unit_x; 5406 while (vx >= 0) 5407 vx -= src_width_fixed; 5408 5409 *pd++ = core_combine_over_u_pixel_sse2 (s, d); 5410 if (pm) 5411 pm++; 5412 5413 w--; 5414 } 5415 } 5416 5417 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER, 5418 scaled_nearest_scanline_sse2_8888_8888_OVER, 5419 uint32_t, uint32_t, COVER) 5420 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, 5421 scaled_nearest_scanline_sse2_8888_8888_OVER, 5422 uint32_t, uint32_t, NONE) 5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, 5424 scaled_nearest_scanline_sse2_8888_8888_OVER, 5425 uint32_t, uint32_t, PAD) 5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER, 5427 scaled_nearest_scanline_sse2_8888_8888_OVER, 5428 uint32_t, uint32_t, NORMAL) 5429 5430 static force_inline void 5431 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, 5432 uint32_t * dst, 5433 const uint32_t * src, 5434 int32_t w, 5435 pixman_fixed_t vx, 5436 pixman_fixed_t unit_x, 5437 pixman_fixed_t src_width_fixed, 5438 pixman_bool_t zero_src) 5439 { 5440 __m128i xmm_mask; 5441 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 5442 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 5443 __m128i xmm_alpha_lo, xmm_alpha_hi; 5444 5445 if (zero_src || (*mask >> 24) == 0) 5446 return; 5447 5448 xmm_mask = create_mask_16_128 (*mask >> 24); 5449 5450 while (w && (uintptr_t)dst & 15) 5451 { 5452 uint32_t s = *(src + pixman_fixed_to_int (vx)); 5453 vx += unit_x; 5454 while (vx >= 0) 5455 vx -= src_width_fixed; 5456 5457 if (s) 5458 { 5459 uint32_t d = *dst; 5460 5461 __m128i ms = unpack_32_1x128 (s); 5462 __m128i alpha = expand_alpha_1x128 (ms); 5463 __m128i dest = xmm_mask; 5464 __m128i alpha_dst
= unpack_32_1x128 (d); 5465 5466 *dst = pack_1x128_32 ( 5467 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); 5468 } 5469 dst++; 5470 w--; 5471 } 5472 5473 while (w >= 4) 5474 { 5475 uint32_t tmp1, tmp2, tmp3, tmp4; 5476 5477 tmp1 = *(src + pixman_fixed_to_int (vx)); 5478 vx += unit_x; 5479 while (vx >= 0) 5480 vx -= src_width_fixed; 5481 tmp2 = *(src + pixman_fixed_to_int (vx)); 5482 vx += unit_x; 5483 while (vx >= 0) 5484 vx -= src_width_fixed; 5485 tmp3 = *(src + pixman_fixed_to_int (vx)); 5486 vx += unit_x; 5487 while (vx >= 0) 5488 vx -= src_width_fixed; 5489 tmp4 = *(src + pixman_fixed_to_int (vx)); 5490 vx += unit_x; 5491 while (vx >= 0) 5492 vx -= src_width_fixed; 5493 5494 xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); 5495 5496 if (!is_zero (xmm_src)) 5497 { 5498 xmm_dst = load_128_aligned ((__m128i*)dst); 5499 5500 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 5501 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 5502 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 5503 &xmm_alpha_lo, &xmm_alpha_hi); 5504 5505 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 5506 &xmm_alpha_lo, &xmm_alpha_hi, 5507 &xmm_mask, &xmm_mask, 5508 &xmm_dst_lo, &xmm_dst_hi); 5509 5510 save_128_aligned ( 5511 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 5512 } 5513 5514 dst += 4; 5515 w -= 4; 5516 } 5517 5518 while (w) 5519 { 5520 uint32_t s = *(src + pixman_fixed_to_int (vx)); 5521 vx += unit_x; 5522 while (vx >= 0) 5523 vx -= src_width_fixed; 5524 5525 if (s) 5526 { 5527 uint32_t d = *dst; 5528 5529 __m128i ms = unpack_32_1x128 (s); 5530 __m128i alpha = expand_alpha_1x128 (ms); 5531 __m128i mask = xmm_mask; 5532 __m128i dest = unpack_32_1x128 (d); 5533 5534 *dst = pack_1x128_32 ( 5535 in_over_1x128 (&ms, &alpha, &mask, &dest)); 5536 } 5537 5538 dst++; 5539 w--; 5540 } 5541 5542 } 5543 5544 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, 5545 scaled_nearest_scanline_sse2_8888_n_8888_OVER, 5546 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) 5547 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, 5548 scaled_nearest_scanline_sse2_8888_n_8888_OVER, 5549 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) 5550 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, 5551 scaled_nearest_scanline_sse2_8888_n_8888_OVER, 5552 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) 5553 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, 5554 scaled_nearest_scanline_sse2_8888_n_8888_OVER, 5555 uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) 5556 5557 #if BILINEAR_INTERPOLATION_BITS < 8 5558 # define BILINEAR_DECLARE_VARIABLES \ 5559 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ 5560 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ 5561 const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \ 5562 const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \ 5563 unit_x, -unit_x, unit_x, -unit_x); \ 5564 const __m128i xmm_zero = _mm_setzero_si128 (); \ 5565 __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1), \ 5566 vx, -(vx + 1), vx, -(vx + 1)) 5567 #else 5568 # define BILINEAR_DECLARE_VARIABLES \ 5569 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ 5570 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ 5571 const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \ 5572 const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \ 5573 -unit_x, -unit_x, -unit_x, -unit_x); \ 5574 const __m128i xmm_zero = _mm_setzero_si128 (); \ 5575 
__m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, \ 5576 -(vx + 1), -(vx + 1), -(vx + 1), -(vx + 1)) 5577 #endif 5578 5579 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ 5580 do { \ 5581 __m128i xmm_wh, xmm_lo, xmm_hi, a; \ 5582 /* fetch 2x2 pixel block into sse2 registers */ \ 5583 __m128i tltr = _mm_loadl_epi64 ( \ 5584 (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \ 5585 __m128i blbr = _mm_loadl_epi64 ( \ 5586 (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \ 5587 vx += unit_x; \ 5588 /* vertical interpolation */ \ 5589 a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \ 5590 xmm_wt), \ 5591 _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \ 5592 xmm_wb)); \ 5593 if (BILINEAR_INTERPOLATION_BITS < 8) \ 5594 { \ 5595 /* calculate horizontal weights */ \ 5596 xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \ 5597 16 - BILINEAR_INTERPOLATION_BITS)); \ 5598 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ 5599 /* horizontal interpolation */ \ 5600 a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \ 5601 a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \ 5602 } \ 5603 else \ 5604 { \ 5605 /* calculate horizontal weights */ \ 5606 xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \ 5607 16 - BILINEAR_INTERPOLATION_BITS)); \ 5608 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ 5609 /* horizontal interpolation */ \ 5610 xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ 5611 xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ 5612 a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ 5613 _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ 5614 } \ 5615 /* shift and pack the result */ \ 5616 a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \ 5617 a = _mm_packs_epi32 (a, a); \ 5618 a = _mm_packus_epi16 (a, a); \ 5619 pix = _mm_cvtsi128_si32 (a); \ 5620 } while (0) 5621 5622 #define BILINEAR_SKIP_ONE_PIXEL() \ 5623 do { \ 5624 vx += unit_x; \ 5625 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ 5626 } while(0) 5627 5628 static force_inline void 5629 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, 5630 const uint32_t * mask, 5631 const uint32_t * src_top, 5632 const uint32_t * src_bottom, 5633 int32_t w, 5634 int wt, 5635 int wb, 5636 pixman_fixed_t vx, 5637 pixman_fixed_t unit_x, 5638 pixman_fixed_t max_vx, 5639 pixman_bool_t zero_src) 5640 { 5641 BILINEAR_DECLARE_VARIABLES; 5642 uint32_t pix1, pix2, pix3, pix4; 5643 5644 while ((w -= 4) >= 0) 5645 { 5646 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 5647 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); 5648 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); 5649 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); 5650 *dst++ = pix1; 5651 *dst++ = pix2; 5652 *dst++ = pix3; 5653 *dst++ = pix4; 5654 } 5655 5656 if (w & 2) 5657 { 5658 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 5659 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); 5660 *dst++ = pix1; 5661 *dst++ = pix2; 5662 } 5663 5664 if (w & 1) 5665 { 5666 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 5667 *dst = pix1; 5668 } 5669 5670 } 5671 5672 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, 5673 scaled_bilinear_scanline_sse2_8888_8888_SRC, 5674 uint32_t, uint32_t, uint32_t, 5675 COVER, FLAG_NONE) 5676 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, 5677 scaled_bilinear_scanline_sse2_8888_8888_SRC, 5678 uint32_t, uint32_t, uint32_t, 5679 PAD, FLAG_NONE) 5680 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, 5681 scaled_bilinear_scanline_sse2_8888_8888_SRC, 5682 uint32_t, uint32_t, uint32_t, 5683 NONE, FLAG_NONE) 5684 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, 5685 
scaled_bilinear_scanline_sse2_8888_8888_SRC, 5686 uint32_t, uint32_t, uint32_t, 5687 NORMAL, FLAG_NONE) 5688 5689 static force_inline void 5690 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, 5691 const uint32_t * mask, 5692 const uint32_t * src_top, 5693 const uint32_t * src_bottom, 5694 int32_t w, 5695 int wt, 5696 int wb, 5697 pixman_fixed_t vx, 5698 pixman_fixed_t unit_x, 5699 pixman_fixed_t max_vx, 5700 pixman_bool_t zero_src) 5701 { 5702 BILINEAR_DECLARE_VARIABLES; 5703 uint32_t pix1, pix2, pix3, pix4; 5704 5705 while (w && ((uintptr_t)dst & 15)) 5706 { 5707 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 5708 5709 if (pix1) 5710 { 5711 pix2 = *dst; 5712 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); 5713 } 5714 5715 w--; 5716 dst++; 5717 } 5718 5719 while (w >= 4) 5720 { 5721 __m128i xmm_src; 5722 __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo; 5723 __m128i xmm_alpha_hi, xmm_alpha_lo; 5724 5725 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 5726 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); 5727 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); 5728 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); 5729 5730 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); 5731 5732 if (!is_zero (xmm_src)) 5733 { 5734 if (is_opaque (xmm_src)) 5735 { 5736 save_128_aligned ((__m128i *)dst, xmm_src); 5737 } 5738 else 5739 { 5740 __m128i xmm_dst = load_128_aligned ((__m128i *)dst); 5741 5742 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 5743 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 5744 5745 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); 5746 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, 5747 &xmm_dst_lo, &xmm_dst_hi); 5748 5749 save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 5750 } 5751 } 5752 5753 w -= 4; 5754 dst += 4; 5755 } 5756 5757 while (w) 5758 { 5759 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 5760 5761 if (pix1) 5762 { 5763 pix2 = *dst; 5764 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); 5765 } 5766 5767 w--; 5768 dst++; 5769 } 5770 } 5771 5772 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, 5773 scaled_bilinear_scanline_sse2_8888_8888_OVER, 5774 uint32_t, uint32_t, uint32_t, 5775 COVER, FLAG_NONE) 5776 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, 5777 scaled_bilinear_scanline_sse2_8888_8888_OVER, 5778 uint32_t, uint32_t, uint32_t, 5779 PAD, FLAG_NONE) 5780 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, 5781 scaled_bilinear_scanline_sse2_8888_8888_OVER, 5782 uint32_t, uint32_t, uint32_t, 5783 NONE, FLAG_NONE) 5784 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, 5785 scaled_bilinear_scanline_sse2_8888_8888_OVER, 5786 uint32_t, uint32_t, uint32_t, 5787 NORMAL, FLAG_NONE) 5788 5789 static force_inline void 5790 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, 5791 const uint8_t * mask, 5792 const uint32_t * src_top, 5793 const uint32_t * src_bottom, 5794 int32_t w, 5795 int wt, 5796 int wb, 5797 pixman_fixed_t vx, 5798 pixman_fixed_t unit_x, 5799 pixman_fixed_t max_vx, 5800 pixman_bool_t zero_src) 5801 { 5802 BILINEAR_DECLARE_VARIABLES; 5803 uint32_t pix1, pix2, pix3, pix4; 5804 uint32_t m; 5805 5806 while (w && ((uintptr_t)dst & 15)) 5807 { 5808 uint32_t sa; 5809 5810 m = (uint32_t) *mask++; 5811 5812 if (m) 5813 { 5814 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 5815 sa = pix1 >> 24; 5816 5817 if (sa == 0xff && m == 0xff) 5818 { 5819 *dst = pix1; 5820 } 5821 else 5822 { 5823 __m128i ms, md, ma, msa; 5824 5825 pix2 = *dst; 5826 ma = expand_alpha_rev_1x128 
(load_32_1x128 (m)); 5827 ms = unpack_32_1x128 (pix1); 5828 md = unpack_32_1x128 (pix2); 5829 5830 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); 5831 5832 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); 5833 } 5834 } 5835 else 5836 { 5837 BILINEAR_SKIP_ONE_PIXEL (); 5838 } 5839 5840 w--; 5841 dst++; 5842 } 5843 5844 while (w >= 4) 5845 { 5846 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; 5847 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 5848 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 5849 5850 m = *(uint32_t*)mask; 5851 5852 if (m) 5853 { 5854 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 5855 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); 5856 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); 5857 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); 5858 5859 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); 5860 5861 if (m == 0xffffffff && is_opaque (xmm_src)) 5862 { 5863 save_128_aligned ((__m128i *)dst, xmm_src); 5864 } 5865 else 5866 { 5867 xmm_dst = load_128_aligned ((__m128i *)dst); 5868 5869 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); 5870 5871 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 5872 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 5873 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 5874 5875 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); 5876 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 5877 5878 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, 5879 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); 5880 5881 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 5882 } 5883 } 5884 else 5885 { 5886 BILINEAR_SKIP_ONE_PIXEL (); 5887 BILINEAR_SKIP_ONE_PIXEL (); 5888 BILINEAR_SKIP_ONE_PIXEL (); 5889 BILINEAR_SKIP_ONE_PIXEL (); 5890 } 5891 5892 w -= 4; 5893 dst += 4; 5894 mask += 4; 5895 } 5896 5897 while (w) 5898 { 5899 uint32_t sa; 5900 5901 m = (uint32_t) *mask++; 5902 5903 if (m) 5904 { 5905 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 5906 sa = pix1 >> 24; 5907 5908 if (sa == 0xff && m == 0xff) 5909 { 5910 *dst = pix1; 5911 } 5912 else 5913 { 5914 __m128i ms, md, ma, msa; 5915 5916 pix2 = *dst; 5917 ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); 5918 ms = unpack_32_1x128 (pix1); 5919 md = unpack_32_1x128 (pix2); 5920 5921 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); 5922 5923 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); 5924 } 5925 } 5926 else 5927 { 5928 BILINEAR_SKIP_ONE_PIXEL (); 5929 } 5930 5931 w--; 5932 dst++; 5933 } 5934 } 5935 5936 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, 5937 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, 5938 uint32_t, uint8_t, uint32_t, 5939 COVER, FLAG_HAVE_NON_SOLID_MASK) 5940 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, 5941 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, 5942 uint32_t, uint8_t, uint32_t, 5943 PAD, FLAG_HAVE_NON_SOLID_MASK) 5944 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, 5945 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, 5946 uint32_t, uint8_t, uint32_t, 5947 NONE, FLAG_HAVE_NON_SOLID_MASK) 5948 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, 5949 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, 5950 uint32_t, uint8_t, uint32_t, 5951 NORMAL, FLAG_HAVE_NON_SOLID_MASK) 5952 5953 static force_inline void 5954 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst, 5955 const uint32_t * mask, 5956 const uint32_t * src_top, 5957 const uint32_t * src_bottom, 5958 int32_t w, 5959 int wt, 5960 
int wb, 5961 pixman_fixed_t vx, 5962 pixman_fixed_t unit_x, 5963 pixman_fixed_t max_vx, 5964 pixman_bool_t zero_src) 5965 { 5966 BILINEAR_DECLARE_VARIABLES; 5967 uint32_t pix1, pix2, pix3, pix4; 5968 __m128i xmm_mask; 5969 5970 if (zero_src || (*mask >> 24) == 0) 5971 return; 5972 5973 xmm_mask = create_mask_16_128 (*mask >> 24); 5974 5975 while (w && ((uintptr_t)dst & 15)) 5976 { 5977 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 5978 if (pix1) 5979 { 5980 uint32_t d = *dst; 5981 5982 __m128i ms = unpack_32_1x128 (pix1); 5983 __m128i alpha = expand_alpha_1x128 (ms); 5984 __m128i dest = xmm_mask; 5985 __m128i alpha_dst = unpack_32_1x128 (d); 5986 5987 *dst = pack_1x128_32 5988 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); 5989 } 5990 5991 dst++; 5992 w--; 5993 } 5994 5995 while (w >= 4) 5996 { 5997 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 5998 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); 5999 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); 6000 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); 6001 6002 if (pix1 | pix2 | pix3 | pix4) 6003 { 6004 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 6005 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 6006 __m128i xmm_alpha_lo, xmm_alpha_hi; 6007 6008 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); 6009 6010 xmm_dst = load_128_aligned ((__m128i*)dst); 6011 6012 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 6013 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 6014 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 6015 &xmm_alpha_lo, &xmm_alpha_hi); 6016 6017 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 6018 &xmm_alpha_lo, &xmm_alpha_hi, 6019 &xmm_mask, &xmm_mask, 6020 &xmm_dst_lo, &xmm_dst_hi); 6021 6022 save_128_aligned 6023 ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 6024 } 6025 6026 dst += 4; 6027 w -= 4; 6028 } 6029 6030 while (w) 6031 { 6032 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 6033 if (pix1) 6034 { 6035 uint32_t d = *dst; 6036 6037 __m128i ms = unpack_32_1x128 (pix1); 6038 __m128i alpha = expand_alpha_1x128 (ms); 6039 __m128i dest = xmm_mask; 6040 __m128i alpha_dst = unpack_32_1x128 (d); 6041 6042 *dst = pack_1x128_32 6043 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); 6044 } 6045 6046 dst++; 6047 w--; 6048 } 6049 } 6050 6051 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, 6052 scaled_bilinear_scanline_sse2_8888_n_8888_OVER, 6053 uint32_t, uint32_t, uint32_t, 6054 COVER, FLAG_HAVE_SOLID_MASK) 6055 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, 6056 scaled_bilinear_scanline_sse2_8888_n_8888_OVER, 6057 uint32_t, uint32_t, uint32_t, 6058 PAD, FLAG_HAVE_SOLID_MASK) 6059 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, 6060 scaled_bilinear_scanline_sse2_8888_n_8888_OVER, 6061 uint32_t, uint32_t, uint32_t, 6062 NONE, FLAG_HAVE_SOLID_MASK) 6063 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, 6064 scaled_bilinear_scanline_sse2_8888_n_8888_OVER, 6065 uint32_t, uint32_t, uint32_t, 6066 NORMAL, FLAG_HAVE_SOLID_MASK) 6067 6068 static const pixman_fast_path_t sse2_fast_paths[] = 6069 { 6070 /* PIXMAN_OP_OVER */ 6071 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), 6072 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), 6073 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888), 6074 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888), 6075 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565), 6076 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565), 6077 PIXMAN_STD_FAST_PATH 
(OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888), 6078 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888), 6079 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888), 6080 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888), 6081 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565), 6082 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565), 6083 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888), 6084 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888), 6085 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888), 6086 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888), 6087 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888), 6088 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888), 6089 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888), 6090 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888), 6091 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888), 6092 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888), 6093 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888), 6094 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888), 6095 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888), 6096 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888), 6097 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888), 6098 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888), 6099 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888), 6100 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888), 6101 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888), 6102 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888), 6103 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888), 6104 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca), 6105 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca), 6106 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca), 6107 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca), 6108 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca), 6109 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca), 6110 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888), 6111 PIXMAN_STD_FAST_PATH<